[otbn] Properly parse optional operands for an instruction This shouldn't have any impact on generated documentation, but the YAML parser now actually understands what syntax like this means: <op0>[, <op1>][, X<op2>] It should match any of: r0 r0, r1 r0, Xr2 r0, r1, Xr2 We don't support nesting brackets (so there are always 2^k possible syntaxes for some k). We also don't check for ambiguity, so the ambiguous: <op0>[, <op1>][, <op2>] is accepted. In practice, we're going to parse assembly syntax with a regex, at least at first, which means the argument will be assigned to op1. But probably don't do that: it's rather confusing! Signed-off-by: Rupert Swarbrick <rswarbrick@lowrisc.org>

commit: 9337f21c8e0ea943b5f476d34a2cab14c362ab53 [log] [tgz]
author: Rupert Swarbrick <rswarbrick@lowrisc.org> Mon Jul 13 17:48:53 2020 +0100
committer: Rupert Swarbrick <rswarbrick@gmail.com> Fri Jul 17 16:38:57 2020 +0100
tree: 51fb02518d599321458df8a90c9901b4e5905a55
parent: c6bac1049be96a8db8bc017af11644f7df846947 [diff]
diff --git a/hw/ip/otbn/data/insns.yml b/hw/ip/otbn/data/insns.yml
index 4b0dbd8..9fe83a7 100644
--- a/hw/ip/otbn/data/insns.yml
+++ b/hw/ip/otbn/data/insns.yml

@@ -1207,7 +1207,7 @@
       - *mulqacc-wrs2-qwsel
       - *mulqacc-acc-shift-imm
     syntax: |
-      [<zero_acc>] <wrd><wrd_hwsel>,
+      [<zero_acc>] <wrd>.<wrd_hwsel>,
       <wrs1>.<wrs1_qwsel>, <wrs2>.<wrs2_qwsel>, <acc_shift_imm>
     glued-ops: true
     doc: |

diff --git a/hw/ip/otbn/util/insn_yaml.py b/hw/ip/otbn/util/insn_yaml.py
index 373bd52..ad6e61a 100644
--- a/hw/ip/otbn/util/insn_yaml.py
+++ b/hw/ip/otbn/util/insn_yaml.py

@@ -751,74 +751,222 @@
         self.doc = doc
 
 
+class SyntaxToken:
+    '''An object representing a single token in an instruction's syntax
+
+    See InsnSyntax for more details. The is_literal attribute is true if this
+    is a literal hunk of text (rather than an operand name). The text attribute
+    either holds the literal syntax or the operand name.
+
+    '''
+    def __init__(self, is_literal: bool, text: str) -> None:
+        assert text
+        self.is_literal = is_literal
+        # Make whitespace canonical for literals
+        self.text = re.sub(r'\s+', ' ', text) if is_literal else text
+
+    def render_doc(self) -> str:
+        '''Return how this syntax token should look in the documentation'''
+        if self.is_literal:
+            return self.text
+        else:
+            return '<{}>'.format(self.text)
+
+
+class SyntaxHunk:
+    '''An object representing a hunk of syntax that might be optional'''
+    def __init__(self,
+                 is_optional: bool,
+                 tokens: List[SyntaxToken],
+                 operands: Set[str]) -> None:
+        assert tokens
+        self.is_optional = is_optional
+        self.tokens = tokens
+        self.operands = operands
+
+    @staticmethod
+    def from_list(operands: List[str]) -> 'SyntaxHunk':
+        '''Smart constructor for a list of operands with "normal" syntax'''
+        assert operands
+        comma = SyntaxToken(True, ', ')
+        tokens = [SyntaxToken(False, operands[0])]
+        for op in operands[1:]:
+            tokens.append(comma)
+            tokens.append(SyntaxToken(False, op))
+
+        op_set = set(operands)
+        assert len(op_set) == len(operands)
+
+        return SyntaxHunk(False, tokens, op_set)
+
+    @staticmethod
+    def from_string(mnemonic: str, optional: bool, raw: str) -> 'SyntaxHunk':
+        '''Smart constructor that parses YAML syntax (see InsnSyntax)'''
+        print('from_string({!r}, {!r}, {!r})'.format(mnemonic, optional, raw))
+        assert raw
+
+        tokens = []
+        op_set = set()
+
+        parts = re.split(r'<([^>]+)>', raw)
+        for idx, part in enumerate(parts):
+            # The matches for the regex appear in positions 1, 3, 5, ...
+            is_literal = not (idx & 1)
+            if ('<' in part or '>' in part) and not is_literal:
+                raise ValueError("Syntax for {!r} has hunk {!r} which doesn't "
+                                 "seem to surround <operand>s properly."
+                                 .format(mnemonic, raw))
+
+            if not is_literal:
+                assert part
+                if part in op_set:
+                    raise ValueError("Syntax for {!r} has hunk {!r} with "
+                                     "more than one occurrence of <{}>."
+                                     .format(mnemonic, raw, part))
+                op_set.add(part)
+
+            # Only allow empty parts (and skip their tokens) if at one end or
+            # the other
+            if not part and idx not in [0, len(parts) - 1]:
+                raise ValueError("Syntax for {!r} has two adjacent operand "
+                                 "tokens, with no intervening syntax."
+                                 .format(mnemonic))
+
+            if part:
+                tokens.append(SyntaxToken(is_literal, part))
+
+        return SyntaxHunk(optional, tokens, op_set)
+
+    def render_doc(self) -> str:
+        '''Return how this hunk should look in the documentation'''
+        parts = []
+        for token in self.tokens:
+            parts.append(token.render_doc())
+
+        body = ''.join(parts)
+        return '[{}]'.format(body) if self.is_optional else body
+
+
 class InsnSyntax:
     '''A class representing the syntax of an instruction
 
-    The pairs attribute gives the operands to the instruction, together with
-    the text between them. For example, if the instruction had the following
-    form:
+    An instruction's syntax is specified in the YAML file by writing it out
+    with operand names surrounded by angle brackets. For example, a simple NOT
+    instruction might have a syntax of
 
-        my_mnemonic  (<op_0>, <op_1>), <op_2>
+        <dst>, <src>
 
-    then it would be represented as:
+    which should be interpreted as the following tokens:
 
-        prefix = '('
-        pairs = [('op_0', ', '),
-                 ('op_1', '), '),
-                 ('op_2', '')]
+        - Operand called 'dst'
+        - A literal ','
+        - Operand called 'src'
 
-    The operands attribute is just the set of operand names (in the example, it
-    would be {'op_0', 'op_1', 'op_2'}).
+    Between the tokens, whitespace is optional (so "r0 , r1" and "r0,r1" both
+    match the syntax above) unless a literal token is just a space, in which
+    case some whitespace is required. For example
+
+        <dst> <src>
+
+    would match "r0 r1" but not "r0r1". Whitespace within literal syntax tokens
+    means that some space is required, matching the regex \\s+. For example,
+    the (rather strange) syntax
+
+       <dst> + - <src>
+
+    would match "r0 + - r1" or "r0+ -r1", but not "r0 +- r1".
+
+    Some operands (and surrounding syntax) might be optional. The optional
+    syntax is surrounded by square brackets. Nesting is not supported. For
+    example:
+
+       <dst>, <src>[, <offset>]
+
+    would match "r0, r1, 123" or "r0, r1".
+
+    Note that a given syntax might be ambiguous. For example,
+
+       <dst>, <src>[, <offset>][, <flavour>]
+
+    With "r0, r1, 123", is 123 an offset or a flavour? (We choose not to embed
+    typing information into the syntax, because that results in very confusing
+    assembler error messages). We break ties in the same way as the underlying
+    regex engine, assigning the operand to the first group, so 123 is an offset
+    in this case. Such syntaxes are rather confusing though, so probably not a
+    good idea.
+
+    The parsed syntax is stored as a list of "hunks". Each hunk contains a flag
+    showing whether the hunk is optional or required and also a list of
+    SyntaxToken objects.
 
     '''
     def __init__(self,
-                 prefix: str,
-                 pairs: List[Tuple[str, str]],
+                 hunks: List[SyntaxHunk],
                  operands: Set[str]) -> None:
-        self.prefix = prefix
-        self.pairs = pairs
+        self.hunks = hunks
         self.operands = operands
 
-    def raw_string(self) -> str:
-        '''Return the raw string of the syntax'''
-        parts = [self.prefix]
-        for operand, suffix in self.pairs:
-            parts.append('<{}>'.format(operand))
-            parts.append(suffix)
-        return ''.join(parts)
-
     @staticmethod
     def from_list(operands: List[str]) -> 'InsnSyntax':
-        op_set = set(operands)
-        assert len(op_set) == len(operands)
-        return InsnSyntax('', [(op, ', ') for op in operands], op_set)
+        '''Smart constructor for a list of operands with "normal" syntax'''
+        if not operands:
+            return InsnSyntax([], set())
+
+        hunk = SyntaxHunk.from_list(operands)
+        return InsnSyntax([hunk], hunk.operands)
 
     @staticmethod
-    def from_yaml(raw: str) -> 'InsnSyntax':
+    def from_yaml(mnemonic: str, raw: str) -> 'InsnSyntax':
         '''Parse the syntax in the YAML file'''
 
-        # The raw syntax looks something like "<foo> + <bar> (baz <qux>)". We
-        # need to check that each <..> holds an operand name. Conveniently,
-        # re.split does exactly what we need, always yielding an odd number of
-        # parts and starting with an empty string if there's a match at the
-        # start.
-        parts = re.split(r'<([^>]+)>', raw)
-        prefix = parts[0]
-        pairs = list(zip(parts[1::2], parts[2::2]))
+        # The raw syntax looks something like
+        #
+        #    <op0>, <op1>[(<op2>)]
+        #
+        # to mean that you either have "r0, r1" or "r0, r2(r3)". First, split
+        # out the bracketed parts.
+        by_left = raw.split('[')
+        parts = [(False, by_left[0])]
+        for after_left in by_left[1:]:
+            split = after_left.split(']', 1)
+            if len(split) != 2:
+                raise ValueError('Unbalanced or nested [] in instruction '
+                                 'syntax for {!r}.'
+                                 .format(mnemonic))
 
-        assert len(parts) == 1 + 2 * len(pairs)
+            parts += [(True, split[0]), (False, split[1])]
 
-        # Collect up the named operands that we've seen, checking for
-        # duplicates
-        operands = set()  # type: Set[str]
-        for operand, _ in pairs:
-            if operand in operands:
-                raise ValueError('Instruction syntax ({!r}) has duplicate '
-                                 'occurrence of the {!r} operand.'
-                                 .format(raw, operand))
-            operands.add(operand)
+        # Now parts contains a list of pairs (required, txt) where txt is a
+        # hunk of the syntax and req is true if this hunk is required. A part
+        # might be empty. For example, "[a]b c[d]" with both lead and trail
+        # with an empty part. But it shouldn't be empty if it's marked
+        # optional: that would be something like "a[]b", which doesn't make
+        # much sense.
+        hunks = []
+        for optional, raw in parts:
+            if raw:
+                hunks.append(SyntaxHunk.from_string(mnemonic, optional, raw))
+            elif optional:
+                raise ValueError('Empty [] in instruction syntax for {!r}.'
+                                 .format(mnemonic))
 
-        return InsnSyntax(prefix, pairs, operands)
+        # Collect up operands across the hunks
+        op_count = 0
+        op_set = set()
+        for hunk in hunks:
+            op_count += len(hunk.operands)
+            op_set |= hunk.operands
+
+        if op_count != len(op_set):
+            raise ValueError('Instruction syntax for {!r} is not '
+                             'linear in its operands.'
+                             .format(mnemonic))
+
+        return InsnSyntax(hunks, op_set)
+
+    def render_doc(self) -> str:
+        '''Return how this syntax should look in the documentation'''
+        return ''.join(hunk.render_doc() for hunk in self.hunks)
 
 
 class EncodingField:
@@ -997,10 +1145,11 @@
 
         raw_syntax = get_optional_str(yd, 'syntax', what)
         if raw_syntax is not None:
-            self.syntax = InsnSyntax.from_yaml(raw_syntax)
+            self.syntax = InsnSyntax.from_yaml(self.mnemonic,
+                                               raw_syntax.strip())
         else:
-            op_name_list = [op.name for op in self.operands]
-            self.syntax = InsnSyntax.from_list(op_name_list)
+            self.syntax = InsnSyntax.from_list([op.name
+                                                for op in self.operands])
 
         # Make sure we have exactly the operands we expect.
         if set(self.name_to_operand.keys()) != self.syntax.operands:

diff --git a/hw/ip/otbn/util/yaml_to_doc.py b/hw/ip/otbn/util/yaml_to_doc.py
index 818119a..6c74818 100755
--- a/hw/ip/otbn/util/yaml_to_doc.py
+++ b/hw/ip/otbn/util/yaml_to_doc.py

@@ -170,7 +170,7 @@
     # Syntax example: either given explicitly or figured out from operands
     parts.append("```\n")
     parts.append(insn.mnemonic.upper() + ('' if insn.glued_ops else ' '))
-    parts.append(insn.syntax.raw_string())
+    parts.append(insn.syntax.render_doc())
     parts.append("\n```\n\n")
 
     # If this came from the RV32I instruction set, say so.
commit	9337f21c8e0ea943b5f476d34a2cab14c362ab53	[log] [tgz]
author	Rupert Swarbrick <rswarbrick@lowrisc.org>	Mon Jul 13 17:48:53 2020 +0100
committer	Rupert Swarbrick <rswarbrick@gmail.com>	Fri Jul 17 16:38:57 2020 +0100
tree	51fb02518d599321458df8a90c9901b4e5905a55
parent	c6bac1049be96a8db8bc017af11644f7df846947 [diff]