blob: cec009438d30f5522cbea7392f446a1bcee72af2 [file] [log] [blame]
#!/usr/bin/env python3
# Copyright lowRISC contributors.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
from pathlib import Path
import tomli
import re
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--debug",
help = "print debugging information (very verbose)",
action="store_true")
parser.add_argument("--debug-only-file",
help = "only process a particular file (helpful to debug)",)
parser.add_argument("--debug-only-line",
help = "only process a particular file (helpful to debug)",)
parser.add_argument("--apply-suggestions",
help = "apply all the fixes suggested by the tools",
action="store_true")
parser.add_argument("--pedantic-reldir",
help = "be especially pedantic about relative path using ./",
action="store_true")
parser.add_argument("root_doc",
help = "path to the root of documentation")
parser.add_argument("--warn-internal-weblinks",
help = "warn about links to internal documentation using docs.opentitan.org URLs",
action=argparse.BooleanOptionalAction,
default = True)
parser.add_argument("--warn-unlisted-files",
help = "warn about md files that are not listed in SUMMARY.md",
action=argparse.BooleanOptionalAction,
default = True)
parser.add_argument("--warn-hugo-links",
help = "warn about links using the old Hugo format",
action=argparse.BooleanOptionalAction,
default = True)
parser.add_argument("--move-list-file",
help = "provide a list of moved files to help fix the links")
parser.add_argument("--ignore-book",
help = "do not search for mdbook and just check all markdown files in a directory",
action="store_true")
g_args = parser.parse_args()
def debug(msg):
if g_args.debug:
print("[debug] " + msg)
def parse_move_list_file(path):
if not path.exists() or not path.is_file():
print(f"error: move file list {path} does not exist or is not a file")
return None
with path.open() as f:
res = {}
line_nr = 0
for line in f:
line_nr += 1
if line.strip() == "":
continue
# ignore empty lines and split at whitespace
arr = line.strip().split()
if len(arr) != 2:
print("move file list f{path}: f{line_nr} has the wrong format and will be ignored")
continue
res[arr[0]] = arr[1]
return res
g_move_list_file = None
if g_args.move_list_file:
g_move_list_file = parse_move_list_file(Path(g_args.move_list_file))
class Book:
"""
Represents an mdbook
"""
def __init__(self, path):
"""
Load a book by pointing to the directory containing a book.toml
This is will build the list of files listed in SUMMARY.md
and also build the list of all md files under this directory
"""
self.path = path.resolve()
self.mdbook_defaults = ["README.md", "index.md"]
self.site_url = "NO_URL"
global g_args
if g_args.ignore_book:
# create a fake book
self.srcdir = path
else:
self._load_book(path)
# build list of all files that appear in the subdirectory
# for each file we will eventually set to True if they are listed in summary
self.allmdfiles = {}
def addfile(f):
name = self.local_name(f)
self.allmdfiles[name] = False
self.apply_to_all_files(addfile)
# load summary
if not g_args.ignore_book:
self._load_summary()
def _load_book(self, path):
toml = self.path / "book.toml"
assert toml.exists(), "there is not book.toml"
toml_dict = tomli.loads(toml.read_bytes().decode("utf-8"))
self.srcdir = (path / toml_dict["book"]["src"]).resolve()
if "output" in toml_dict and "html" in toml_dict["output"]:
self.site_url = toml_dict["output"]["html"].get("site-url", self.site_url)
def _load_summary(self):
# load summary and mark all links as true, ignore all errors for now
for (line, _, link) in Book.list_links_in_file(self.srcdir / "SUMMARY.md"):
name = self.local_name(self.srcdir / link)
# we only check .md files, SUMMARY.md can include more types of files
# with preprocessors
if name.name.endswith(".md"):
# if ever SUMMARY.md contains an invalid link, ignore it (it will reported by the general check)
if name in self.allmdfiles:
# assert name in self.allmdfiles, f"book at {self.path}: link {link} in summary is invalid (normalized to {str(name)})"
self.allmdfiles[name] = True
def __repr__(self):
return f"Book({repr(self.path)})"
def __str__(self):
return f"book at {str(self.path)}, src at {str(self.srcdir)}, url at {str(self.site_url)}"
def local_name(self, abspath):
"""
Given an absolute Path to a file under the source directory of the book,
return a relative Path to the source directory, return None in case of error
"""
try:
return abspath.resolve().relative_to(self.srcdir)
except ValueError:
return None
def apply_to_all_files(self, fn):
"""
List all files under the directory of the book and call a function on each one
"""
def apply_rec(path):
for child in path.iterdir():
if child.name.endswith(".md"):
fn(child)
if child.is_dir():
apply_rec(child.resolve())
apply_rec(self.path)
def list_links_in_file(abspath):
"""
Return a list of all the links in a file (given by its absolute Path)
Each entry is a tuple of the form (line_nr, (start, end), link)
indicating the line number, the character range within the line and the link itself
"""
cm_re = r"\[[^\]]*\]\(\s*(?P<url>[^)]*)\s*\)"
# we also look for non-markdown-but-Hugo-relref
hugo_relref_re = r"(?P<hugo>\{\{<\s*relref\s*\"[^\"]*\"\s*>\}\})"
prog = re.compile(f"{cm_re}|{hugo_relref_re}")
links = []
with abspath.open() as f:
line_nr = 0
for line in f:
line_nr += 1
for x in prog.finditer(line):
grp = "url" if x.group("url") is not None else "hugo"
posrange = (x.start(grp), x.end(grp))
links.append((line_nr, posrange, x[grp]))
return links
def is_file_in_summary(self, path):
"""
Check whether a file (given by its absolute Path) is in the SUMMARY.md file of the book
"""
# if the file does not exists, it cannot be in summary
if not path.exists() or not path.is_file():
return False
local_name = self.local_name(path)
# if it points to an md file, it must be in the summary
assert path.name.endswith(".md"), f"this function only works for md files, called on {path}"
assert local_name in self.allmdfiles, f"link to \"{path}\" normalised to {local_name} not in list"
return self.allmdfiles[local_name]
def path_exists_or_was_moved(self, path, reverse_move=False):
"""
This function takes a Path and returns a Path to the file, which could be potentially
different if the file was moved. This function handles file moves as recorded in the
move file list. It returns None if it cannot resolve it to an *existing* file or directory.
"""
global g_move_list_file
move_list_file = g_move_list_file
if move_list_file is None:
move_list_file = {}
# reverse map if asked
if reverse_move:
res = {}
for (from_, to) in move_list_file.items():
res[to] = from_
move_list_file = res
# first check if it was moved because it could be that a file was moved
# and later replaced by another file with different content: the move takes priority
local_name = self.local_name(path)
debug(f"CHECK MOVE {path}, normalized to {local_name}, exists {path.exists()}, reverse {reverse_move}, moved {str(local_name) in move_list_file}")
if local_name is None:
return None
if str(local_name) in move_list_file:
moved_to = move_list_file[str(local_name)]
assert moved_to[0] != '/', "the target of a move cannot be an absolute path"
newpath = self.srcdir / moved_to
# we only check new path existence for non-reverse moved
if reverse_move or newpath.exists():
debug(f"move detected {path}, normalize to {local_name}, now {newpath}")
return newpath
if path.exists():
return path
return None
def _resolve_existing_path(self, path, report):
"""
Given a Path to an existing file/directory, figure out what is the intended markdown target.
This handles for example a link to a directory that contains an README.md.
This returns a Path to the intended markdown file, or None if none could be found.
When retuning None, it will report the error using the report function.
"""
# if it points to a directory, we try to resolve it to README.md or index.md
if path.is_dir():
for subfile in self.mdbook_defaults:
if (path / subfile).exists():
return path / subfile
report(
f"link to directory \"{self.local_name(path)}\" but cannot find a valid subfile, tried {self.mdbook_defaults}")
return None
else:
return path
def resolve_relative_link(self, local_filename, link, relative_to, report):
"""
Given a link (a string) in a file (represented by its relative Path to the source directory),
try to find the Path to the intended markdown target and return it.
If none can be found, returns None. Any error/warning will reporting via the report function.
This function will handle things like pointing to directories, using index.html instead of README.md
and other oddities.
"""
# if the link starts with a '/', this will mess with pathlib because bla / link will just produce bla
# but we want the link to relative to what was specified, so remove the initial '/' if needed
while len(link) > 0 and link[0] == '/':
link = link[1:]
debug(f"resolving {link} relative to {local_filename}")
# there are several cases:
# - points to a file
# - points to a directory
path = (relative_to / link).resolve()
if (newpath := self.path_exists_or_was_moved(path)) is not None:
return self._resolve_existing_path(newpath, report)
debug("direct resolution to a moved file has failed")
# if the link ends with index.html, and the corresponding index.md or README.md file exists
# then resolve it to that file
index_html = "index.html"
if link.endswith(index_html):
for subfile in self.mdbook_defaults:
new_link = link[:-len(index_html)] + subfile
path = (relative_to / new_link).resolve()
if (newpath := self.path_exists_or_was_moved(path)) is not None:
# FIXME should we check that this is in the summary?
report(
f"link to HTML file \"{link}\", resolved to \"{self.local_name(newpath)}\", which is a local file")
return path
# no solution found
report(
f"link to \"{link}\", resolved to \"{self.local_name(path)}\", which does not exists")
return None
def resolve_local_link(self, local_filename, link, report):
return self.resolve_relative_link(local_filename, link, (self.srcdir / local_filename).parent, report)
def resolve_global_link(self, local_filename, link, report):
return self.resolve_relative_link(local_filename, link, self.srcdir, report)
def find_all_books(path):
"""
Return a list of Book constructed from the book.toml files that can be
found under path (recursively).
"""
books = []
for child in path.iterdir():
if child.is_dir():
books.extend(find_all_books(child))
if child.is_file() and child.name == "book.toml":
books.append(Book(path))
return books
def relative_link_to(relpath, relative_to_dir):
"""
Given an absolute Path to a file (or directory) and an absolute Path to a directory,
return a relative path to reach the file from the directory.
For example relative_link_to(/path/to/some/very/deep/file/hello.md, /path/to/a/shallow/dir)
returns ../../../some/very/deep/file/hello.md
"""
parent = relative_to_dir.resolve()
pref = "./"
while True:
try:
out = relpath.relative_to(parent)
return pref + str(out)
except ValueError:
parent = parent.parent
if pref == "./":
pref = "../"
else:
pref += "../"
print(
f"ERROR couldn't create link for {relpath} relative to {relative_to_dir.resolve()}, this should never happen")
return None
# decide when to suggest a link rewriting
# the main to detect here is that some links are of theform x
# but the canonical link would be just "./x"
# at this stage we don't want to bother the use with such nonsense
def should_rewrite_link(newlink, oldlink):
if oldlink == newlink:
return False
if newlink == "./" + oldlink and not g_args.pedantic_reldir:
return False
return True
def try_several_resolutions(book, local_filename, link, process_list, report):
"""
This function tries to resolve a link is several different ways by
calling the various functions in the processing list. To make the
user output readable, it will not forward any of the report/suggestions
unless the list has size 1 or the function suceeds. If no function suceeds,
it will print error message saying that
no rule applied.
"""
if (process_list) == 1:
return process_list[0](book, local_filename, link, report)
# try them one by one
for fn in process_list:
action_list = []
def store_report(*args):
action_list.append(("report", *args))
out = fn(book, local_filename, link, store_report)
if out is None:
continue
# do actions
for act in action_list:
if act[0] == "report":
report(*act[1:])
else:
assert False
return out
report(f"could not resolve \"{link}\"")
return None
def process_path(book, local_filename, path, report):
# check that path is in the summary (for md files)
global g_args
if path.name.endswith(".md") and not book.is_file_in_summary(path) and not g_args.ignore_book:
report(f"link to \"{book.local_name(path)}\", which is not listed in SUMMARY.md")
# at the moment, report the error but consider this a good link nonetheless
return path # this link is ok
def _process_relative_link(book, local_filename, link, report):
# at this point, we know this is a local link and we want to check it
# but also suggest some changes if we can figure out how to fix/lint it
# the following function tries to resolve where this link should point to
# then we compute how it should be written in the file
# and if the result is different from the original, we suggest a rewrite
debug(f"processing local link {link}")
path = book.resolve_local_link(local_filename, link, report)
debug(f"local link resolution gave {path}")
if path is None:
# cannot resolve link, assume that resolve_local_link reported the error
return
return process_path(book, local_filename, path, report)
def __process_hugo_relref(book, local_filename, link, report, local_search):
# relref can either be relative path to this file or relative path
# to the root of the documentation
debug(f"internal processing hugo link {link}")
if local_search:
if link.startswith('/'):
return None
return _process_relative_link(book, local_filename, link, report)
else:
debug(f"try to resolve global link {link}")
path = book.resolve_global_link(local_filename, link, report)
if path is None:
# cannot resolve link, assume that resolve_global_link reported the error
return None
return process_path(book, local_filename, path, report)
def _process_hugo_relref(book, local_filename, link, report):
# Hugo references can do things like pointing to a file bla.md just by writing bla
# this is makes it quite tricky since when we start mixing this up with move file
# rewriting or guess of directories, there is no easy way to detect when we should
# add .md to the link; similarly a link can point to a directory x/ in which case
# it should be understood as pointing to x/_index.md or x/index.md
# the solution is just to try with and without it
debug(f"processing hugo relref {link}")
def add_md(book, local_filename, link, report, local_search):
if link.endswith(".md"):
return None
return __process_hugo_relref(book, local_filename, link + ".md", report, local_search)
def add_subfile(book, local_filename, link, report, subfile, local_search):
if link.endswith(".md"):
return None
suff = subfile if link.endswith('/') else '/' + subfile
return __process_hugo_relref(book, local_filename, link + suff, report, local_search)
# not that we try add .md only if the first (normal) resolution fails
# following Hugo's documentation, we first try to resolve it relatively
# to this file and then relatively to the repo (unless it starts with /
# in which case it cannot be a relative link)
processes = [
# LOCAL search
lambda *arg: __process_hugo_relref(*arg, True), # normal processing
lambda *arg: add_md(*arg, True), # try to add .md add the end
lambda *arg: add_subfile(*arg, "index.md", True), # try to add /index.md
lambda *arg: add_subfile(*arg, "_index.md", True), # try to add /_index.md
# GLOBAL search
lambda *arg: __process_hugo_relref(*arg, False), # normal processing
lambda *arg: add_md(*arg, False), # try to add .md add the end
lambda *arg: add_subfile(*arg, "index.md", False), # try to add /index.md
lambda *arg: add_subfile(*arg, "_index.md", False), # try to add /_index.md
]
return try_several_resolutions(book, local_filename, link, processes, report)
def split_anchor_and(book, local_filename, link, process, report):
arr = link.split("#")
if len(arr) > 2:
report(f"link \"{link}\" is weird, giving up on that one")
return
base = arr[0]
anchor = "#" + arr[1] if len(arr) == 2 else ''
if base == "":
return
debug(f"split link to base \"{base}\" and fragment \"{anchor}\"")
path = process(book, local_filename, base, report)
if path is None:
return None
else:
return (path, anchor)
def process_relative_link(book, local_filename, link, report):
return split_anchor_and(book, local_filename, link, _process_relative_link, report)
def process_hugo_relref(book, local_filename, link, report):
return split_anchor_and(book, local_filename, link, _process_hugo_relref, report)
def show_unlisted_files(book):
print(f"#### {book.path} ####")
# check files not in summary
header_printed = False
for (name, in_summary) in book.allmdfiles.items():
# obviously ignore SUMMARY.md
if not in_summary and name.name != "SUMMARY.md":
if not header_printed:
print("Files not listed in SUMMARY.md:")
header_printed = True
print(f"- {name}")
# we will store the list of all suggestions and rewrite everything at the end
# for extra safety, when recording a suggestion, we not only memorize what
# we should change to, but also what we are changing from
# when doing the changes, we will check that this matches
g_suggestions = {}
def add_suggestion(path, line_nr, pos_range, oldvalue, newvalue):
path = path.resolve()
ff = g_suggestions.get(path, {})
ll = ff.get(line_nr, {})
ll[pos_range] = (oldvalue, newvalue)
ff[line_nr] = ll
g_suggestions[path] = ff
# apply all suggestions in a file
def apply_suggestions(filename, suggestions):
out_content = ""
with filename.open() as f:
print(filename)
line_nr = 0
for line in f:
line_nr += 1
if line_nr in suggestions:
pos_delta = 0
for ((start, end), (oldvalue, newvalue)) in sorted(suggestions[line_nr].items(), key=lambda x: x[0][0]):
# adjust
start += pos_delta
end += pos_delta
# we make sure that the bit that we are replacing matches what it is supposed to replace
# this is just for extra safety, if there is no bug this assert will always hold
assert line[start:
end] == oldvalue, f"mismatch between expected old value ({oldvalue}) and actual old value ({line[start:end]})"
line = line[:start] + newvalue + line[end:]
# change delta
pos_delta += len(newvalue) - len(oldvalue)
out_content += line
filename.write_text(out_content)
def check_all_links(book):
global g_args
hugo_relref_prog = re.compile(r"^\{\{<\s*relref\s*\"(?P<url>[^\"]*)\"\s*>\}\}$")
print(f"#### {book.path} ####")
# for all listed file, check links
for (name, in_summary) in book.allmdfiles.items():
# debugging check
if g_args.debug_only_file is not None:
local_only_name = book.local_name(Path(g_args.debug_only_file))
if local_only_name != name:
continue # skip file
# we ignore files that are not in the summary (but we do check SUMMARY.md for bad links)
# warning: the check for SUMMARY.md needs to be sure to only consider SUMMARY.md at the root
# of the book, there could be others SUMMARY.md in the tree that are unrelated, we don't want to
# include those if they are not listed
if not g_args.ignore_book and not in_summary and name == Path("SUMMARY.md"):
continue # skip file
# get links
header_printed = False
for (line_nr, posrange, link) in Book.list_links_in_file(book.srcdir / name):
# debugging check
oldname = None
if g_args.debug_only_line is not None:
only_line = int(g_args.debug_only_line)
if line_nr != only_line:
continue # skip line
def suggest(newvalue):
add_suggestion(book.srcdir / name, line_nr, posrange, link, newvalue)
def report(msg):
# it's global because the whole code is not in a function and nonlocal does not work in this case
nonlocal header_printed
if not header_printed:
if oldname is not None and oldname != name:
print(f"In {name} (previously {oldname}):")
else:
print(f"In {name}:")
header_printed = True
print(f"- line {line_nr}: {msg}")
def maybe_suggest_rewrite(res):
if res is None:
return
(path, fragment) = res
# this link is good but maybe it is not written in the canonical way, warn about that
relpath = relative_link_to(path, (book.srcdir / name).parent)
newlink = str(relpath) + fragment
if should_rewrite_link(newlink, link):
report(f"link \"{link}\" resolved to \"{newlink}\"")
report(f"SUGGEST rewrite link to \"{newlink}\"")
suggest(newlink)
# link to internal doc that use the opentitan url
if link.startswith("https://docs.opentitan.org/"):
if g_args.warn_internal_weblinks:
report(f"link {link} points to internal document, use URL_ROOT or local link")
continue
# if this file was moved, we need to make sure that the relative links that we compute
# are relative to the old position of the file
oldname = book.local_name(book.path_exists_or_was_moved(book.srcdir / name, True))
if oldname != name:
debug(f"file {name} was previously {oldname}")
# old Hugo links
if link.startswith("{{<") or link.startswith("{{%"):
# this tool only understands a tiny subset of those
x = hugo_relref_prog.match(link)
if x is not None:
res = process_hugo_relref(book, oldname, x["url"], report)
maybe_suggest_rewrite(res)
elif g_args.warn_hugo_links:
report(f"link \"{link}\" uses old Hugo syntax")
continue
# URL outside of our doc or mailto
if link.startswith("https://") or link.startswith("mailto:"):
# ignore
continue
# unsecure url
if link.startswith("http://"):
report(f"url {link} is not secure")
continue
# cross-book links, not checking those yet
if link.startswith("{{URL_ROOT}}"):
report(f"link \"{link}\" NOT CHECKED")
continue
# try to resolve the path and suggest a rewriting if necessary
res = process_relative_link(book, oldname, link, report)
maybe_suggest_rewrite(res)
g_root_doc = Path(g_args.root_doc).resolve()
# if root doc is a link to a book.toml file, just open this one
if g_root_doc.is_file():
assert g_root_doc.name == "book.toml", "you must give a path a book.toml file or a directory"
g_book_list = [Book(g_root_doc.parent)]
elif g_args.ignore_book:
# create a fake book that contains everything
g_book_list = [Book(g_root_doc)]
else:
g_book_list = find_all_books(g_root_doc)
print("books:")
for p in g_book_list:
print("- {}".format(p))
# find all md files in the tree that are not listed in any SUMMARY.md
if not g_args.ignore_book and g_args.warn_unlisted_files:
for book in g_book_list:
show_unlisted_files(book)
for book in g_book_list:
check_all_links(book)
print("### Summary of suggestions ###")
for (filename, per_file) in g_suggestions.items():
print(f"In {filename}:")
for (line_nr, per_line) in per_file.items():
for (posrange, (oldvalue, newvalue)) in per_line.items():
print(f"- line {line_nr},{posrange[0]}-{posrange[1]}: \"{oldvalue}\" -> \"{newvalue}\"")
if g_args.apply_suggestions:
for (filename, per_file) in g_suggestions.items():
apply_suggestions(filename, per_file)