blob: acbe5db1aba5f624549bef4dfa0b37817da00025 [file] [log] [blame]
# Copyright lowRISC contributors.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
'''Code for making sense of instruction syntax as defined in insns.yml'''
import re
from typing import Dict, List, Set, Tuple
from .operand import Operand
class SyntaxToken:
'''An object representing a single token in an instruction's syntax
See InsnSyntax for more details. The is_literal attribute is true if this
is a literal hunk of text (rather than an operand name). The text attribute
either holds the literal syntax or the operand name.
'''
def __init__(self, is_literal: bool, text: str) -> None:
assert text
self.is_literal = is_literal
# Make whitespace canonical for literals
self.text = re.sub(r'\s+', ' ', text) if is_literal else text
def render_doc(self) -> str:
'''Return how this syntax token should look in the documentation'''
if self.is_literal:
return self.text
else:
return '<{}>'.format(self.text)
def asm_pattern(self) -> str:
'''Return a regex pattern that can be used for matching this token
If the token represents an operand, the pattern is wrapped in a group
(to capture the operand). For more details about the syntax, see
InsnSyntax.
'''
if self.is_literal:
# A literal that is pure whitespace "requires the whitespace".
# Otherwise, replace all internal whitespace with \s+ and allow
# optional whitespace afterwards. To do this easily, we split the
# literal on whitespace. The result is empty iff it was just
# whitespace in the first place.
words = self.text.split()
if not words:
return r'\s+'
# For non-whitespace literals, we disallow leading space and add
# optional trailing space. This convention should avoid lots of
# \s*\s* pairs.
parts = [re.escape(words[0])]
for w in words[1:]:
parts.append(r'\s+')
parts.append(re.escape(w))
parts.append(r'\s*')
return ''.join(parts)
# Otherwise, this is an operand. For now, at least, we're very
# restrictive for operands. No spaces and no commas (the second rule
# avoids silliness like "a, b, c" matching a syntax with only two
# operands by setting the second to "b, c").
#
# We also split out ++ and -- separately, to disambiguate things like
# x1++, which must be parsed as x1 followed by ++.
#
# However, we do want to allow things like ".+123". To get this right,
# suppose S matches any character other than elements of " ,+-". Then
# we can use a regex like "-?S(\+?-?S)*". This avoids two consecutive +
# or - signs. It also allows .+-3 (i.e. the current PC minus 3). It
# doesn't allow .-+3, but we probably don't care.
#
# If we want to do better and allow things like
#
# addi x0, x1, 1 + 3
#
# then we need to use something more serious than just regexes for
# parsing.
s_re = r'[^ ,+\-]'
not_inc_or_dec = ''.join([r'(?:-?', s_re, r'(?:\+?-?', s_re, r')*)'])
return ''.join([r'(\+\+|--|', not_inc_or_dec, r')\s*'])
def render(self,
cur_pc: int,
op_vals: Dict[str, int],
operands: Dict[str, Operand]) -> str:
'''Generate an assembly listing for this syntax token
If the syntax token is an operand, that operand is retrieved from
op_vals and rendered.
'''
if self.is_literal:
return self.text
assert self.text in op_vals
assert self.text in operands
op_type = operands[self.text].op_type
return op_type.op_val_to_str(op_vals[self.text], cur_pc)
class SyntaxHunk:
'''An object representing a hunk of syntax that might be optional'''
def __init__(self,
is_optional: bool,
tokens: List[SyntaxToken],
op_list: List[str],
op_set: Set[str]) -> None:
assert tokens
self.is_optional = is_optional
self.tokens = tokens
self.op_list = op_list
self.op_set = op_set
@staticmethod
def from_list(operands: List[str]) -> 'SyntaxHunk':
'''Smart constructor for a list of operands with "normal" syntax'''
assert operands
comma = SyntaxToken(True, ', ')
tokens = [SyntaxToken(False, operands[0])]
for op in operands[1:]:
tokens.append(comma)
tokens.append(SyntaxToken(False, op))
op_set = set(operands)
assert len(op_set) == len(operands)
return SyntaxHunk(False, tokens, operands, op_set)
@staticmethod
def from_string(mnemonic: str, optional: bool, raw: str) -> 'SyntaxHunk':
'''Smart constructor that parses YAML syntax (see InsnSyntax)'''
assert raw
tokens = []
op_list = []
op_set = set()
parts = re.split(r'<([^>]+)>', raw)
for idx, part in enumerate(parts):
# The matches for the regex appear in positions 1, 3, 5, ...
is_literal = not (idx & 1)
if ('<' in part or '>' in part) and not is_literal:
raise ValueError("Syntax for {!r} has hunk {!r} which doesn't "
"seem to surround <operand>s properly."
.format(mnemonic, raw))
if not is_literal:
assert part
if part in op_set:
raise ValueError("Syntax for {!r} has hunk {!r} with "
"more than one occurrence of <{}>."
.format(mnemonic, raw, part))
op_list.append(part)
op_set.add(part)
# Only allow empty parts (and skip their tokens) if at one end or
# the other
if not part and idx not in [0, len(parts) - 1]:
raise ValueError("Syntax for {!r} has two adjacent operand "
"tokens, with no intervening syntax."
.format(mnemonic))
if part:
tokens.append(SyntaxToken(is_literal, part))
return SyntaxHunk(optional, tokens, op_list, op_set)
def render_doc(self) -> str:
'''Return how this hunk should look in the documentation'''
parts = []
for token in self.tokens:
parts.append(token.render_doc())
body = ''.join(parts)
return '[{}]'.format(body) if self.is_optional else body
def asm_pattern(self) -> str:
'''Return a regex pattern that can be used for matching this hunk
The result will have a group per operand. It allows trailing, but not
leading, space within the hunk.
'''
parts = []
for token in self.tokens:
parts.append(token.asm_pattern())
body = ''.join(parts)
# For an optional hunk, we build it up in the form "(?:foo)?". This
# puts a non-capturing group around foo and then applies "?"
# (one-or-more) to it.
return '(?:{})?'.format(body) if self.is_optional else body
def render(self,
cur_pc: int,
op_vals: Dict[str, int],
operands: Dict[str, Operand]) -> str:
'''Return an assembly listing for the hunk given operand values
If this hunk is optional and all its operands are zero, the hunk is
omitted (so this function returns the empty string).
'''
if self.is_optional:
required = False
for op_name in self.op_list:
if op_vals[op_name] != 0:
required = True
break
if not required:
return ''
return ''.join(token.render(cur_pc, op_vals, operands)
for token in self.tokens)
class InsnSyntax:
'''A class representing the syntax of an instruction
An instruction's syntax is specified in the YAML file by writing it out
with operand names surrounded by angle brackets. For example, a simple NOT
instruction might have a syntax of
<dst>, <src>
which should be interpreted as the following tokens:
- Operand called 'dst'
- A literal ','
- Operand called 'src'
Between the tokens, whitespace is optional (so "x0 , x1" and "x0,x1" both
match the syntax above) unless a literal token is just a space, in which
case some whitespace is required. For example
<dst> <src>
would match "x0 x1" but not "x0x1". Whitespace within literal syntax tokens
means that some space is required, matching the regex \\s+. For example,
the (rather strange) syntax
<dst> + - <src>
would match "x0 + - x1" or "x0+ -x1", but not "x0 +- x1".
Some operands (and surrounding syntax) might be optional. The optional
syntax is surrounded by square brackets. Nesting is not supported. For
example:
<dst>, <src>[, <offset>]
would match "x0, x1, 123" or "x0, x1".
Note that a given syntax might be ambiguous. For example,
<dst>, <src>[, <offset>][, <flavour>]
With "x0, x1, 123", is 123 an offset or a flavour? (We choose not to embed
typing information into the syntax, because that results in very confusing
assembler error messages). We break ties in the same way as the underlying
regex engine, assigning the operand to the first group, so 123 is an offset
in this case. Such syntaxes are rather confusing though, so probably not a
good idea.
The parsed syntax is stored as a list of "hunks". Each hunk contains a flag
showing whether the hunk is optional or required and also a list of
SyntaxToken objects.
'''
def __init__(self,
hunks: List[SyntaxHunk],
op_list: List[str],
op_set: Set[str]) -> None:
self.hunks = hunks
self.op_list = op_list
self.op_set = op_set
@staticmethod
def from_list(operands: List[str]) -> 'InsnSyntax':
'''Smart constructor for a list of operands with "normal" syntax'''
if not operands:
return InsnSyntax([], [], set())
hunk = SyntaxHunk.from_list(operands)
return InsnSyntax([hunk], hunk.op_list, hunk.op_set)
@staticmethod
def from_yaml(mnemonic: str, raw: str) -> 'InsnSyntax':
'''Parse the syntax in the YAML file'''
# The raw syntax looks something like
#
# <op0>, <op1>[(<op2>)]
#
# to mean that you either have "x0, x1" or "x0, x2(x3)". First, split
# out the bracketed parts.
by_left = raw.split('[')
parts = [(False, by_left[0])]
for after_left in by_left[1:]:
split = after_left.split(']', 1)
if len(split) != 2:
raise ValueError('Unbalanced or nested [] in instruction '
'syntax for {!r}.'
.format(mnemonic))
parts += [(True, split[0]), (False, split[1])]
# Now parts contains a list of pairs (required, txt) where txt is a
# hunk of the syntax and req is true if this hunk is required. A part
# might be empty. For example, "[a]b c[d]" with both lead and trail
# with an empty part. But it shouldn't be empty if it's marked
# optional: that would be something like "a[]b", which doesn't make
# much sense.
hunks = []
for optional, raw in parts:
if raw:
hunks.append(SyntaxHunk.from_string(mnemonic, optional, raw))
elif optional:
raise ValueError('Empty [] in instruction syntax for {!r}.'
.format(mnemonic))
# Collect up operands across the hunks
op_list = []
op_set = set()
for hunk in hunks:
op_list += hunk.op_list
op_set |= hunk.op_set
if len(op_list) != len(op_set):
raise ValueError('Instruction syntax for {!r} is not '
'linear in its operands.'
.format(mnemonic))
return InsnSyntax(hunks, op_list, op_set)
def render_doc(self) -> str:
'''Return how this syntax should look in the documentation'''
return ''.join(hunk.render_doc() for hunk in self.hunks)
def asm_pattern(self) -> Tuple[str, Dict[str, int]]:
'''Return a regex pattern and a group name map for this syntax'''
parts = [r'\s*']
for hunk in self.hunks:
parts.append(hunk.asm_pattern())
parts.append('$')
pattern = ''.join(parts)
op_to_grp = {}
for idx, op in enumerate(self.op_list):
op_to_grp[op] = 1 + idx
return (pattern, op_to_grp)
def render(self,
cur_pc: int,
op_vals: Dict[str, int],
operands: Dict[str, Operand]) -> List[str]:
'''Return an assembly listing for the given operand fields
The listings for hunks are returned separately (to allow an instruction
to support glued_ops). To generate the final listing, concatenate them.
'''
return [hunk.render(cur_pc, op_vals, operands) for hunk in self.hunks]