| # Copyright lowRISC contributors. |
| # Licensed under the Apache License, Version 2.0, see LICENSE for details. |
| # SPDX-License-Identifier: Apache-2.0 |
| |
| '''Code for making sense of instruction syntax as defined in insns.yml''' |
| |
| import re |
| from typing import Dict, List, Set, Tuple |
| |
| from .operand import Operand |
| |
| |
| class SyntaxToken: |
| '''An object representing a single token in an instruction's syntax |
| |
| See InsnSyntax for more details. The is_literal attribute is true if this |
| is a literal hunk of text (rather than an operand name). The text attribute |
| either holds the literal syntax or the operand name. |
| |
| ''' |
| def __init__(self, is_literal: bool, text: str) -> None: |
| assert text |
| self.is_literal = is_literal |
| # Make whitespace canonical for literals |
| self.text = re.sub(r'\s+', ' ', text) if is_literal else text |
| |
| def render_doc(self) -> str: |
| '''Return how this syntax token should look in the documentation''' |
| if self.is_literal: |
| return self.text |
| else: |
| return '<{}>'.format(self.text) |
| |
| def asm_pattern(self) -> str: |
| '''Return a regex pattern that can be used for matching this token |
| |
| If the token represents an operand, the pattern is wrapped in a group |
| (to capture the operand). For more details about the syntax, see |
| InsnSyntax. |
| |
| ''' |
| if self.is_literal: |
| # A literal that is pure whitespace "requires the whitespace". |
| # Otherwise, replace all internal whitespace with \s+ and allow |
| # optional whitespace afterwards. To do this easily, we split the |
| # literal on whitespace. The result is empty iff it was just |
| # whitespace in the first place. |
| words = self.text.split() |
| if not words: |
| return r'\s+' |
| |
| # For non-whitespace literals, we disallow leading space and add |
| # optional trailing space. This convention should avoid lots of |
| # \s*\s* pairs. |
| parts = [re.escape(words[0])] |
| for w in words[1:]: |
| parts.append(r'\s+') |
| parts.append(re.escape(w)) |
| parts.append(r'\s*') |
| |
| return ''.join(parts) |
| |
| # Otherwise, this is an operand. For now, at least, we're very |
| # restrictive for operands. No spaces and no commas (the second rule |
| # avoids silliness like "a, b, c" matching a syntax with only two |
| # operands by setting the second to "b, c"). |
| # |
| # We also split out ++ and -- separately, to disambiguate things like |
| # x1++, which must be parsed as x1 followed by ++. |
| # |
| # However, we do want to allow things like ".+123". To get this right, |
| # suppose S matches any character other than elements of " ,+-". Then |
| # we can use a regex like "-?S(\+?-?S)*". This avoids two consecutive + |
| # or - signs. It also allows .+-3 (i.e. the current PC minus 3). It |
| # doesn't allow .-+3, but we probably don't care. |
| # |
| # If we want to do better and allow things like |
| # |
| # addi x0, x1, 1 + 3 |
| # |
| # then we need to use something more serious than just regexes for |
| # parsing. |
| s_re = r'[^ ,+\-]' |
| not_inc_or_dec = ''.join([r'(?:-?', s_re, r'(?:\+?-?', s_re, r')*)']) |
| return ''.join([r'(\+\+|--|', not_inc_or_dec, r')\s*']) |
| |
| def render(self, |
| cur_pc: int, |
| op_vals: Dict[str, int], |
| operands: Dict[str, Operand]) -> str: |
| '''Generate an assembly listing for this syntax token |
| |
| If the syntax token is an operand, that operand is retrieved from |
| op_vals and rendered. |
| |
| ''' |
| if self.is_literal: |
| return self.text |
| |
| assert self.text in op_vals |
| assert self.text in operands |
| |
| op_type = operands[self.text].op_type |
| return op_type.op_val_to_str(op_vals[self.text], cur_pc) |
| |
| |
| class SyntaxHunk: |
| '''An object representing a hunk of syntax that might be optional''' |
| def __init__(self, |
| is_optional: bool, |
| tokens: List[SyntaxToken], |
| op_list: List[str], |
| op_set: Set[str]) -> None: |
| assert tokens |
| self.is_optional = is_optional |
| self.tokens = tokens |
| self.op_list = op_list |
| self.op_set = op_set |
| |
| @staticmethod |
| def from_list(operands: List[str]) -> 'SyntaxHunk': |
| '''Smart constructor for a list of operands with "normal" syntax''' |
| assert operands |
| comma = SyntaxToken(True, ', ') |
| tokens = [SyntaxToken(False, operands[0])] |
| for op in operands[1:]: |
| tokens.append(comma) |
| tokens.append(SyntaxToken(False, op)) |
| |
| op_set = set(operands) |
| assert len(op_set) == len(operands) |
| |
| return SyntaxHunk(False, tokens, operands, op_set) |
| |
| @staticmethod |
| def from_string(mnemonic: str, optional: bool, raw: str) -> 'SyntaxHunk': |
| '''Smart constructor that parses YAML syntax (see InsnSyntax)''' |
| assert raw |
| |
| tokens = [] |
| op_list = [] |
| op_set = set() |
| |
| parts = re.split(r'<([^>]+)>', raw) |
| for idx, part in enumerate(parts): |
| # The matches for the regex appear in positions 1, 3, 5, ... |
| is_literal = not (idx & 1) |
| if ('<' in part or '>' in part) and not is_literal: |
| raise ValueError("Syntax for {!r} has hunk {!r} which doesn't " |
| "seem to surround <operand>s properly." |
| .format(mnemonic, raw)) |
| |
| if not is_literal: |
| assert part |
| if part in op_set: |
| raise ValueError("Syntax for {!r} has hunk {!r} with " |
| "more than one occurrence of <{}>." |
| .format(mnemonic, raw, part)) |
| op_list.append(part) |
| op_set.add(part) |
| |
| # Only allow empty parts (and skip their tokens) if at one end or |
| # the other |
| if not part and idx not in [0, len(parts) - 1]: |
| raise ValueError("Syntax for {!r} has two adjacent operand " |
| "tokens, with no intervening syntax." |
| .format(mnemonic)) |
| |
| if part: |
| tokens.append(SyntaxToken(is_literal, part)) |
| |
| return SyntaxHunk(optional, tokens, op_list, op_set) |
| |
| def render_doc(self) -> str: |
| '''Return how this hunk should look in the documentation''' |
| parts = [] |
| for token in self.tokens: |
| parts.append(token.render_doc()) |
| |
| body = ''.join(parts) |
| return '[{}]'.format(body) if self.is_optional else body |
| |
| def asm_pattern(self) -> str: |
| '''Return a regex pattern that can be used for matching this hunk |
| |
| The result will have a group per operand. It allows trailing, but not |
| leading, space within the hunk. |
| |
| ''' |
| parts = [] |
| for token in self.tokens: |
| parts.append(token.asm_pattern()) |
| body = ''.join(parts) |
| |
| # For an optional hunk, we build it up in the form "(?:foo)?". This |
| # puts a non-capturing group around foo and then applies "?" |
| # (one-or-more) to it. |
| return '(?:{})?'.format(body) if self.is_optional else body |
| |
| def render(self, |
| cur_pc: int, |
| op_vals: Dict[str, int], |
| operands: Dict[str, Operand]) -> str: |
| '''Return an assembly listing for the hunk given operand values |
| |
| If this hunk is optional and all its operands are zero, the hunk is |
| omitted (so this function returns the empty string). |
| |
| ''' |
| if self.is_optional: |
| required = False |
| for op_name in self.op_list: |
| if op_vals[op_name] != 0: |
| required = True |
| break |
| |
| if not required: |
| return '' |
| |
| return ''.join(token.render(cur_pc, op_vals, operands) |
| for token in self.tokens) |
| |
| |
| class InsnSyntax: |
| '''A class representing the syntax of an instruction |
| |
| An instruction's syntax is specified in the YAML file by writing it out |
| with operand names surrounded by angle brackets. For example, a simple NOT |
| instruction might have a syntax of |
| |
| <dst>, <src> |
| |
| which should be interpreted as the following tokens: |
| |
| - Operand called 'dst' |
| - A literal ',' |
| - Operand called 'src' |
| |
| Between the tokens, whitespace is optional (so "x0 , x1" and "x0,x1" both |
| match the syntax above) unless a literal token is just a space, in which |
| case some whitespace is required. For example |
| |
| <dst> <src> |
| |
| would match "x0 x1" but not "x0x1". Whitespace within literal syntax tokens |
| means that some space is required, matching the regex \\s+. For example, |
| the (rather strange) syntax |
| |
| <dst> + - <src> |
| |
| would match "x0 + - x1" or "x0+ -x1", but not "x0 +- x1". |
| |
| Some operands (and surrounding syntax) might be optional. The optional |
| syntax is surrounded by square brackets. Nesting is not supported. For |
| example: |
| |
| <dst>, <src>[, <offset>] |
| |
| would match "x0, x1, 123" or "x0, x1". |
| |
| Note that a given syntax might be ambiguous. For example, |
| |
| <dst>, <src>[, <offset>][, <flavour>] |
| |
| With "x0, x1, 123", is 123 an offset or a flavour? (We choose not to embed |
| typing information into the syntax, because that results in very confusing |
| assembler error messages). We break ties in the same way as the underlying |
| regex engine, assigning the operand to the first group, so 123 is an offset |
| in this case. Such syntaxes are rather confusing though, so probably not a |
| good idea. |
| |
| The parsed syntax is stored as a list of "hunks". Each hunk contains a flag |
| showing whether the hunk is optional or required and also a list of |
| SyntaxToken objects. |
| |
| ''' |
| def __init__(self, |
| hunks: List[SyntaxHunk], |
| op_list: List[str], |
| op_set: Set[str]) -> None: |
| self.hunks = hunks |
| self.op_list = op_list |
| self.op_set = op_set |
| |
| @staticmethod |
| def from_list(operands: List[str]) -> 'InsnSyntax': |
| '''Smart constructor for a list of operands with "normal" syntax''' |
| if not operands: |
| return InsnSyntax([], [], set()) |
| |
| hunk = SyntaxHunk.from_list(operands) |
| return InsnSyntax([hunk], hunk.op_list, hunk.op_set) |
| |
| @staticmethod |
| def from_yaml(mnemonic: str, raw: str) -> 'InsnSyntax': |
| '''Parse the syntax in the YAML file''' |
| |
| # The raw syntax looks something like |
| # |
| # <op0>, <op1>[(<op2>)] |
| # |
| # to mean that you either have "x0, x1" or "x0, x2(x3)". First, split |
| # out the bracketed parts. |
| by_left = raw.split('[') |
| parts = [(False, by_left[0])] |
| for after_left in by_left[1:]: |
| split = after_left.split(']', 1) |
| if len(split) != 2: |
| raise ValueError('Unbalanced or nested [] in instruction ' |
| 'syntax for {!r}.' |
| .format(mnemonic)) |
| |
| parts += [(True, split[0]), (False, split[1])] |
| |
| # Now parts contains a list of pairs (required, txt) where txt is a |
| # hunk of the syntax and req is true if this hunk is required. A part |
| # might be empty. For example, "[a]b c[d]" with both lead and trail |
| # with an empty part. But it shouldn't be empty if it's marked |
| # optional: that would be something like "a[]b", which doesn't make |
| # much sense. |
| hunks = [] |
| for optional, raw in parts: |
| if raw: |
| hunks.append(SyntaxHunk.from_string(mnemonic, optional, raw)) |
| elif optional: |
| raise ValueError('Empty [] in instruction syntax for {!r}.' |
| .format(mnemonic)) |
| |
| # Collect up operands across the hunks |
| op_list = [] |
| op_set = set() |
| for hunk in hunks: |
| op_list += hunk.op_list |
| op_set |= hunk.op_set |
| |
| if len(op_list) != len(op_set): |
| raise ValueError('Instruction syntax for {!r} is not ' |
| 'linear in its operands.' |
| .format(mnemonic)) |
| |
| return InsnSyntax(hunks, op_list, op_set) |
| |
| def render_doc(self) -> str: |
| '''Return how this syntax should look in the documentation''' |
| return ''.join(hunk.render_doc() for hunk in self.hunks) |
| |
| def asm_pattern(self) -> Tuple[str, Dict[str, int]]: |
| '''Return a regex pattern and a group name map for this syntax''' |
| parts = [r'\s*'] |
| for hunk in self.hunks: |
| parts.append(hunk.asm_pattern()) |
| parts.append('$') |
| pattern = ''.join(parts) |
| |
| op_to_grp = {} |
| for idx, op in enumerate(self.op_list): |
| op_to_grp[op] = 1 + idx |
| |
| return (pattern, op_to_grp) |
| |
| def render(self, |
| cur_pc: int, |
| op_vals: Dict[str, int], |
| operands: Dict[str, Operand]) -> List[str]: |
| '''Return an assembly listing for the given operand fields |
| |
| The listings for hunks are returned separately (to allow an instruction |
| to support glued_ops). To generate the final listing, concatenate them. |
| |
| ''' |
| return [hunk.render(cur_pc, op_vals, operands) for hunk in self.hunks] |