[util] Add Licence Checker

This commit adds a script which checks the licence headers in our
source code against an expected licence header.

The script can check most of the text file formats that we have checked
into the repository, though it has some limitations. It ensures that the
entire licence appears on consecutive lines in the first comment in the
file, and that those lines contain nothing else.
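
For example, for a file checked using the `#` comment style (such as a
Python script), the checker expects to find these lines, uninterrupted,
in the file's first comment:

    # Copyright lowRISC contributors.
    # Licensed under the Apache License, Version 2.0, see LICENSE for details.
    # SPDX-License-Identifier: Apache-2.0

Earlier lines in that same comment, such as a `#!` shebang line, are
allowed before the licence.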

The primary limitation of the checker is that each file suffix is
matched to exactly one comment style, which is the only style used when
looking for the licence header. In text formats which accept multiple
comment styles, there is now a canonical style that the licence must
use. Where available, this is a line comment style.
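
For instance, C and C++ sources accept both `//` and `/* */` comments,
but the checker only recognises the licence when it uses the canonical
`//` style for those files:

    // Copyright lowRISC contributors.
    // Licensed under the Apache License, Version 2.0, see LICENSE for details.
    // SPDX-License-Identifier: Apache-2.0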

The other limitation applies to files where the canonical style is a
block comment, like `/* */`: each line of the licence must be wrapped in
its own comment prefix and suffix, rather than the whole licence header
being wrapped in a single prefix and suffix pair. This is an artefact of
how the checker searches for the licence.
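
For example, in linker scripts (which use the `/* */` style), the
checker looks for:

    /* Copyright lowRISC contributors. */
    /* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
    /* SPDX-License-Identifier: Apache-2.0 */

rather than a single block comment spanning all of the licence lines.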

The checker is configured using an hjson file, which contains the exact
licence header and a list of file patterns to exclude from the check.
The exclusion list is used for vendored and other externally sourced
files.

Signed-off-by: Sam Elliott <selliott@lowrisc.org>
diff --git a/util/licence-checker.hjson b/util/licence-checker.hjson
new file mode 100644
index 0000000..53efcfa
--- /dev/null
+++ b/util/licence-checker.hjson
@@ -0,0 +1,61 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+{
+  licence:
+    '''
+    Copyright lowRISC contributors.
+    Licensed under the Apache License, Version 2.0, see LICENSE for details.
+    SPDX-License-Identifier: Apache-2.0
+    ''',
+  exclude_paths: [
+    # Exclude anything in vendor directories
+    '*/vendor/*/*',
+
+    ## Hardware Exclusions
+
+    # DV Vendored test vectors
+    'hw/dv/sv/test_vectors',
+    # Vendored CRC model
+    'hw/dv/dpi/usbdpi/usb_crc.c',
+    # Vendored HMAC model
+    'hw/ip/hmac/dv/cryptoc_dpi/*',
+    # Vendored FPGA information
+    'hw/top_earlgrey/data/*.xdc',
+    'util/fpga/bram_load.mmi',
+    # Manually checked, files contain additional copyright header lines
+    'hw/ip/usb_fs_nb_pe/rtl/*',
+    'hw/ip/usbdev/rtl/usbdev_iomux.sv',
+    'hw/ip/usbuart/rtl/usb_serial_*_ep.sv',
+
+    ## Software Exclusions
+
+    # Coremark Vendored Files
+    'sw/device/benchmarks/coremark/top_earlgrey/*',
+    # Tock Vendored Files
+    'sw/device/tock/chips/opentitan_common/src/uart.rs',
+    'sw/device/tock/chips/opentitan_common/src/lib.rs',
+    'sw/device/tock/chips/opentitan_common/src/gpio.rs',
+    'sw/device/tock/chips/earlgrey/src/uart.rs',
+    'sw/device/tock/chips/earlgrey/src/timer.rs',
+    'sw/device/tock/chips/earlgrey/src/plic.rs',
+    'sw/device/tock/chips/earlgrey/src/lib.rs',
+    'sw/device/tock/chips/earlgrey/src/interrupts.rs',
+    'sw/device/tock/chips/earlgrey/src/gpio.rs',
+    'sw/device/tock/chips/earlgrey/src/chip.rs',
+    'sw/device/tock/boards/opentitan/src/main.rs',
+    'sw/device/tock/boards/opentitan/src/io.rs',
+    'sw/device/tock/boards/opentitan/layout.ld',
+    'sw/device/tock/boards/opentitan/build.rs',
+    'sw/device/tock/**/*.lock',
+    'sw/device/tock/boards/opentitan/rust-toolchain',
+
+    ## Other Exclusions
+
+    # Other Licences
+    'util/wavegen/LICENSE.wavedrom',
+    # Site Assets
+    'site/**/assets/scss/**',
+    'site/landing/static/js/tiny-slider.js',
+  ],
+}
diff --git a/util/licence-checker.py b/util/licence-checker.py
new file mode 100755
index 0000000..70d6d17
--- /dev/null
+++ b/util/licence-checker.py
@@ -0,0 +1,376 @@
+#!/usr/bin/env python3
+#
+# Copyright lowRISC contributors.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
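+"""Check that source files carry the expected licence header.
+
+The exact licence text and the list of paths to exclude from the check
+are read from an hjson configuration file, for example:
+
+    util/licence-checker.py --config util/licence-checker.hjson
+"""
+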
+import argparse
+import fnmatch
+import logging
+import subprocess
+import sys
+from itertools import groupby
+from pathlib import Path
+from types import SimpleNamespace
+
+import hjson
+from tabulate import tabulate
+
+
+class LicenceHeader(object):
+    """Represents the licence header we want to insert"""
+    def __init__(self, text):
+        self._lines = text.strip().splitlines()
+
+    def __getitem__(self, idx):
+        return self._lines.__getitem__(idx)
+
+    def __len__(self):
+        return self._lines.__len__()
+
+    def numbered_lines(self, skip=0):
+        """Returns an iterator of (line_no, line_text).
+
+        `line_no` counts from 1, and is for humans to count line numbers with.
+        Use `skip` to skip enumerating the first few lines.
+        """
+        return enumerate(self._lines[skip:], start=1 + skip)
+
+    @property
+    def first_word(self):
+        # Take everything up to the first space (the line may be a single word).
+        first_word = self._lines[0].split(' ', 1)[0]
+        return first_word
+
+
+class LineCommentStyle(object):
+    """Helpers for line-style comments."""
+    def __init__(self, prefix):
+        self.comment_prefix = str(prefix)
+        self.first_line_prefix = self.comment_prefix
+
+    def search_line(self, licence_first_word):
+        return self.comment_prefix + ' ' + licence_first_word
+
+    def expected_full_line(self, licence_line):
+        return self.comment_prefix + ' ' + licence_line
+
+
+class DifferentFirstLineCommentStyle(LineCommentStyle):
+    """Some files have a different allowable prefix for their first line."""
+    def __init__(self, prefix, first_line_prefix):
+        LineCommentStyle.__init__(self, prefix)
+        self.first_line_prefix = first_line_prefix
+
+
+class BlockCommentStyle(object):
+    """Helpers for block-style comments."""
+    def __init__(self, prefix, suffix):
+        self.comment_prefix = str(prefix)
+        self.comment_suffix = str(suffix)
+        self.first_line_prefix = self.comment_prefix
+
+    def search_line(self, licence_first_word):
+        return self.comment_prefix + ' ' + licence_first_word
+
+    def expected_full_line(self, licence_line):
+        return self.comment_prefix + ' ' + licence_line + ' ' + self.comment_suffix
+
+
+SLASH_SLASH = LineCommentStyle("//")
+HASH = LineCommentStyle("#")
+SLASH_STAR = BlockCommentStyle("/*", "*/")
+
+# (Prioritised) Mapping of file name suffixes to CommentStyle object.
+# If the suffix of your file does not match one of these, it will not be
+# checked.
+#
+# These rules are given in priority order. Tuples of (extensions, style) higher
+# in the list are matched before those later in the list, on purpose.
+#
+# Files that do not match any extension, or which have a style of `None` are
+# not checked for a licence.
+COMMENT_CHARS = [
+    # Hardware Files
+    ([".svh", ".sv", ".sv.tpl"], SLASH_SLASH),  # SystemVerilog
+
+    # Hardware Build Systems
+    ([".tcl", ".sdc"], HASH),  # tcl
+    ([".core", ".core.tpl"],
+     DifferentFirstLineCommentStyle("#", "CAPI=2")),  # FuseSoC Core Files
+    (["Makefile", ".mk"], HASH),  # Makefiles
+    ([".ys"], HASH),  # Yosys script
+    ([".waiver"], HASH),  # AscentLint waiver files
+    ([".vlt"], SLASH_SLASH),  # Verilator configuration (waiver) files
+    ([".vbl"], HASH),  # Verible configuration files
+    ([".el", ".el.tpl"], SLASH_SLASH),  # Exclusion list
+    ([".f"], None),  # File lists
+
+    # The following two rules will inevitably bite us.
+    (["riviera_run.do"], HASH),  # Riviera dofile
+    ([".do"], SLASH_SLASH),  # Cadence LEC dofile
+
+    # Software Files
+    ([".c", ".c.tpl", ".h", ".h.tpl", ".cc", ".cpp"], SLASH_SLASH),  # C, C++
+    ([".S"], SLASH_SLASH),  # Assembly
+    ([".ld", ".ld.tpl"], SLASH_STAR),  # Linker Scripts
+    ([".rs"], SLASH_SLASH),  # Rust
+
+    # Software Build Systems
+    (["meson.build", "toolchain.txt", "meson_options.txt"], HASH),  # Meson
+
+    # General Tooling
+    ([".py"], HASH),  # Python
+    ([".sh"], HASH),  # Shell Scripts
+    (["Dockerfile"], HASH),  # Dockerfiles
+
+    # Configuration
+    ([".hjson", ".hjson.tpl"], SLASH_SLASH),  # hjson
+    ([".yml", ".yaml"], HASH),  # YAML
+    ([".toml"], HASH),  # TOML
+    (["-requirements.txt"], HASH),  # Apt and Python requirements files
+    (["redirector.conf"], HASH),  # nginx config
+
+    # Documentation
+    ([".md", ".md.tpl", ".html"], None),  # Markdown and HTML
+    ([".css"], SLASH_STAR),  # CSS
+    ([".scss"], SLASH_SLASH),  # SCSS
+
+    # Templates (Last because there are overlaps with extensions above)
+    ([".tpl"], HASH),  # Mako templates
+]
+
+
+def detect_comment_char(filename):
+    for (suffixes, commentstyle) in COMMENT_CHARS:
+        for suffix in suffixes:
+            if filename.endswith(suffix):
+                return commentstyle
+
+    return None
+
+
+def git_find_repo_toplevel():
+    git_output = subprocess.check_output(
+        ['git', 'rev-parse', '--show-toplevel'])
+    return Path(git_output.decode().strip()).resolve()
+
+
+def git_find_all_file_paths(top_level, search_paths):
+    git_output = subprocess.check_output(
+        ["git", "-C",
+         str(top_level), "ls-files", "-z", "--", *search_paths])
+    for path in git_output.rstrip(b"\0").split(b"\0"):
+        yield Path(top_level, path.decode())
+
+
+class ResultsTracker(object):
+    """Helper for tracking results"""
+    def __init__(self, base_dir):
+        self.base_dir = base_dir
+
+        self.passed_count = 0
+        self.failed_count = 0
+        self.excluded_count = 0
+        self.skipped_count = 0
+
+        self.failing_paths = set()
+
+    @property
+    def total_count(self):
+        return (self.passed_count + self.failed_count +
+                self.skipped_count + self.excluded_count)
+
+    def passed(self, path, line_no, reason):
+        rel_path = path.relative_to(self.base_dir)
+        logging.debug("%s:%d PASSED: %s", str(rel_path), line_no, reason)
+        self.passed_count += 1
+
+    def failed(self, path, line_no, reason):
+        rel_path = path.relative_to(self.base_dir)
+        logging.error("%s:%d FAILED: %s", str(rel_path), line_no, reason)
+        self.failing_paths.add(rel_path)
+        self.failed_count += 1
+
+    def skipped(self, path, reason):
+        rel_path = path.relative_to(self.base_dir)
+        logging.info("%s: SKIPPED: %s", str(rel_path), reason)
+        self.skipped_count += 1
+
+    def excluded(self, path, reason):
+        rel_path = path.relative_to(self.base_dir)
+        logging.debug("%s: EXCLUDED: %s", str(rel_path), reason)
+        self.excluded_count += 1
+
+    def any_failed(self):
+        return self.failed_count > 0
+
+    def display_nicely(self):
+        headers = ["Results:", "Files"]
+        results = [["Passed", self.passed_count],
+                   ["Failed", self.failed_count],
+                   ["Skipped", self.skipped_count],
+                   ["Excluded", self.excluded_count],
+                   ["Total", self.total_count]]
+
+        return tabulate(results, headers, tablefmt="simple")
+
+
+def matches_exclude_pattern(config, file_path):
+    rel_path = str(file_path.relative_to(config.base_dir))
+    for exclude_pattern in config.exclude_paths:
+        if fnmatch.fnmatch(rel_path, exclude_pattern):
+            return True
+    return False
+
+
+def check_paths(config, git_paths):
+    results = ResultsTracker(config.base_dir)
+
+    for filepath in git_find_all_file_paths(config.base_dir, git_paths):
+        # Skip symlinks (with message)
+        if filepath.is_symlink():
+            results.excluded(filepath, "File is a symlink")
+            continue
+
+        # Skip non-file
+        if not filepath.is_file():
+            continue
+
+        # Skip exclude patterns
+        if matches_exclude_pattern(config, filepath):
+            results.excluded(filepath, "Path matches exclude pattern")
+            continue
+
+        check_file_for_licence(config.licence, results, filepath)
+
+    return results
+
+
+def check_file_for_licence(licence, results, filepath):
+    comment_style = detect_comment_char(filepath.name)
+
+    if comment_style is None:
+        results.skipped(filepath, "Unknown comment style")
+        return
+
+    if filepath.stat().st_size == 0:
+        results.skipped(filepath, "Empty file")
+        return
+
+    def next_line(file, line_no):
+        return (next(file).rstrip(), line_no + 1)
+
+    with filepath.open() as f:
+        licence_assumed_start = None
+
+        # Get first line
+        try:
+            line, line_no = next_line(f, 0)
+        except StopIteration:
+            results.failed(filepath, 1, "Empty file")
+            return
+
+        licence_search_marker = comment_style.search_line(licence.first_word)
+
+        # Check first line against the first word of licence, or against a
+        # possible different first line.
+        if not line.startswith(licence_search_marker):
+            if not line.startswith(comment_style.first_line_prefix):
+                results.failed(filepath, line_no,
+                               "File does not start with comment")
+                return
+
+            try:
+                line, line_no = next_line(f, line_no)
+            except StopIteration:
+                results.failed(filepath, line_no,
+                               "Reached end of file before finding licence")
+                return
+
+        # Skip lines that don't seem to be the first line of the licence
+        while not line.startswith(licence_search_marker):
+            try:
+                line, line_no = next_line(f, line_no)
+            except StopIteration:
+                results.failed(filepath, line_no,
+                               "Reached end of file before finding licence")
+                return
+
+            if not line.startswith(comment_style.comment_prefix):
+                results.failed(filepath, line_no,
+                               "First comment ended before licence notice")
+                return
+
+        # We found the marker, so we found the first line of the licence.
+        # The current line is in the first comment, so check the line matches the
+        # expected first line:
+        licence_assumed_start = line_no
+        if line != comment_style.expected_full_line(licence[0]):
+            results.failed(filepath, line_no, "Licence does not match")
+            return
+
+        for (licence_line_no, licence_line) in licence.numbered_lines(skip=1):
+            try:
+                line, line_no = next_line(f, line_no)
+            except StopIteration:
+                results.failed(filepath, line_no,
+                               "Reached end of file before finding licence")
+                return
+
+            # Check against full expected line.
+            if line != comment_style.expected_full_line(licence_line):
+                results.failed(filepath, line_no, "Licence does not match")
+                return
+
+    results.passed(filepath, licence_assumed_start, "Licence found")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description=
+        "A tool to check the lowRISC licence header is in each source file")
+    parser.add_argument("--config",
+                        metavar="config.hjson",
+                        type=argparse.FileType('r', encoding='UTF-8'),
+                        required=True,
+                        help="HJSON file to read for licence configuration.")
+    parser.add_argument("paths",
+                        metavar="path",
+                        nargs='*',
+                        default=["."],
+                        help="Paths to check for licence headers.")
+    parser.add_argument('-v',
+                        "--verbose",
+                        action='store_true',
+                        dest='verbose',
+                        help="Verbose output")
+
+    options = parser.parse_args()
+
+    if options.verbose:
+        logging.basicConfig(format="%(levelname)s: %(message)s",
+                            level=logging.INFO)
+    else:
+        logging.basicConfig(format="%(levelname)s: %(message)s")
+
+    config = SimpleNamespace()
+    config.base_dir = git_find_repo_toplevel()
+
+    parsed_config = hjson.load(options.config)
+
+    config.licence = LicenceHeader(parsed_config['licence'])
+    config.exclude_paths = set(parsed_config['exclude_paths'])
+
+    results = check_paths(config, options.paths)
+
+    print(results.display_nicely())
+
+    if results.any_failed():
+        print("Failed:")
+        for path in results.failing_paths:
+            print("  {}".format(str(path)))
+        print("")
+        sys.exit(1)
+    else:
+        sys.exit(0)
+
+
+if __name__ == '__main__':
+    main()