blob: a8b02c4bc7536954aa841fedc9512ba26225f96c [file] [log] [blame]
# Copyright lowRISC contributors.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
'''Support code for reading the instruction database in insns.yml'''
import itertools
import re
from typing import (Callable, Dict, List, Optional,
Sequence, Set, Tuple, TypeVar, Union)
import yaml
T = TypeVar('T')
def check_keys(obj: object,
what: str,
required_keys: List[str],
optional_keys: List[str]) -> Dict[str, object]:
'''Check that obj is a dict object with the expected keys
If not, raise a ValueError; the what argument names the object.
'''
if not isinstance(obj, dict):
raise ValueError("{} is expected to be a dict, but was actually a {}."
.format(what, type(obj).__name__))
allowed = set()
missing = []
for key in required_keys:
assert key not in allowed
allowed.add(key)
if key not in obj:
missing.append(key)
for key in optional_keys:
assert key not in allowed
allowed.add(key)
unexpected = []
for key in obj:
if key not in allowed:
unexpected.append(key)
if missing or unexpected:
mstr = ('The following required fields were missing: {}.'
.format(', '.join(missing)) if missing else '')
ustr = ('The following unexpected fields were found: {}.'
.format(', '.join(unexpected)) if unexpected else '')
raise ValueError("{} doesn't have the right keys. {}{}{}"
.format(what,
mstr,
' ' if mstr and ustr else '',
ustr))
return obj
def check_str(obj: object, what: str) -> str:
'''Check that the given object is a string
If not, raise a ValueError; the what argument names the object.
'''
if not isinstance(obj, str):
raise ValueError('{} is of type {}, not a string.'
.format(what, type(obj).__name__))
return obj
def check_optional_str(obj: object, what: str) -> Optional[str]:
'''Check that the given object is a string or None
If not, raise a ValueError; the what argument names the object.
'''
if obj is not None and not isinstance(obj, str):
raise ValueError('{} is of type {}, not a string.'
.format(what, type(obj).__name__))
return obj
def check_bool(obj: object, what: str) -> bool:
'''Check that the given object is a bool
If not, raise a ValueError; the what argument names the object.
'''
if obj is not True and obj is not False:
raise ValueError('{} is of type {}, not a string.'
.format(what, type(obj).__name__))
return obj
def check_list(obj: object, what: str) -> List[object]:
'''Check that the given object is a list
If not, raise a ValueError; the what argument names the object.
'''
if not isinstance(obj, list):
raise ValueError('{} is of type {}, not a list.'
.format(what, type(obj).__name__))
return obj
def index_list(what: str,
objs: Sequence[T],
get_key: Callable[[T], str]) -> Dict[str, T]:
ret = {}
for obj in objs:
key = get_key(obj)
if key in ret:
raise ValueError('Duplicate object with key {} in {}.'
.format(key, what))
ret[key] = obj
return ret
class InsnGroup:
def __init__(self, yml: object) -> None:
yd = check_keys(yml, 'insn-group', ['key', 'title', 'doc'], [])
self.key = check_str(yd['key'], 'insn-group key')
self.title = check_str(yd['title'], 'insn-group title')
self.doc = check_str(yd['doc'], 'insn-group doc')
class InsnGroups:
def __init__(self, yml: object) -> None:
self.groups = [InsnGroup(y) for y in check_list(yml, 'insn-groups')]
if not self.groups:
raise ValueError('Empty list of instruction groups: '
'we need at least one as a base group.')
self.key_to_group = index_list('insn-groups',
self.groups, lambda ig: ig.key)
def default_group(self) -> str:
'''Get the name of the default instruction group'''
assert self.groups
return self.groups[0].key
class BitRanges:
'''Represents the bit ranges used for a field in an encoding scheme'''
def __init__(self,
mask: int,
ranges: List[Tuple[int, int]],
width: int) -> None:
self.mask = mask
self.ranges = ranges
self.width = width
@staticmethod
def from_list(ranges: List[Tuple[int, int]]) -> 'BitRanges':
mask = 0
width = 0
for msb, lsb in ranges:
assert 0 <= lsb <= msb <= 31
rng_mask = (1 << (msb + 1)) - (1 << lsb)
assert not (rng_mask & mask)
mask |= rng_mask
width += msb - lsb + 1
return BitRanges(mask, ranges, width)
@staticmethod
def from_yaml(as_string: str, what: str) -> 'BitRanges':
# ranges ::= range
# | range ',' ranges
#
# range ::= num
# | num ':' num
#
# Ranges are assumed to be msb:lsb (with msb >= lsb). Bit indices are
# at most 31 and ranges are disjoint.
if not as_string:
raise ValueError('Empty string as bits for {}'.format(what))
overlaps = 0
mask = 0
ranges = []
width = 0
for rng in as_string.split(','):
match = re.match(r'([0-9]+)(?:-([0-9]+))?$', rng)
if match is None:
raise ValueError('Range {!r} in bits for {} is malformed.'
.format(rng, what))
msb = int(match.group(1))
maybe_lsb = match.group(2)
lsb = msb if maybe_lsb is None else int(maybe_lsb)
if msb < lsb:
raise ValueError('Range {!r} in bits for {} has msb < lsb.'
.format(rng, what))
if msb >= 32:
raise ValueError('Range {!r} in bits for {} has msb >= 32.'
.format(rng, what))
rng_mask = (1 << (msb + 1)) - (1 << lsb)
overlaps |= rng_mask & mask
mask |= rng_mask
ranges.append((msb, lsb))
width += msb - lsb + 1
if overlaps:
raise ValueError('Bits for {} have overlapping ranges '
'(mask: {:#08x})'
.format(what, overlaps))
return BitRanges(mask, ranges, width)
def __eq__(self, other: object) -> bool:
return isinstance(other, BitRanges) and self.ranges == other.ranges
def encode(self, value: int) -> int:
'''Encode the given value as bit fields'''
ret = 0
bits_taken = 0
for msb, lsb in self.ranges:
rng_width = msb - lsb + 1
value_msb = self.width - 1 - bits_taken
value_lsb = value_msb - rng_width + 1
rng_mask = (1 << rng_width) - 1
rng_value = (value >> value_lsb) & rng_mask
ret |= rng_value << lsb
bits_taken += rng_width
assert bits_taken == self.width
return ret
def decode(self, raw: int) -> int:
'''Extract the bit fields from the given value'''
ret = 0
for msb, lsb in self.ranges:
width = msb - lsb + 1
mask = (1 << width) - 1
ret <<= width
ret |= (raw >> lsb) & mask
return ret
class BoolLiteral:
'''Represents a boolean literal, with possible 'x characters
We represent this as 2 masks: "ones" and "xs". The ones mask is the bits
that are marked 1. The xs mask is the bits that are marked x. Then you can
test whether a particular value matches the literal by zeroing all bits in
the x mask and then comparing with the ones mask.
'''
def __init__(self, ones: int, xs: int, width: int) -> None:
assert width > 0
assert (ones >> width) == 0
assert (xs >> width) == 0
self.ones = ones
self.xs = xs
self.width = width
@staticmethod
def from_string(as_string: str, what: str) -> 'BoolLiteral':
ones = 0
xs = 0
width = 0
# The literal should always start with a 'b'
if not as_string.startswith('b'):
raise ValueError("Boolean literal for {} doesn't start with a 'b'."
.format(what))
for char in as_string[1:]:
if char == '_':
continue
ones <<= 1
xs <<= 1
width += 1
if char == '0':
continue
elif char == '1':
ones |= 1
elif char == 'x':
xs |= 1
else:
raise ValueError('Boolean literal for {} has '
'unsupported character: {!r}.'
.format(what, char))
if not width:
raise ValueError('Empty boolean literal for {}.'.format(what))
return BoolLiteral(ones, xs, width)
def char_for_bit(self, bit: int) -> str:
'''Return 0, 1 or x for the bit at the given position'''
assert bit < self.width
if (self.ones >> bit) & 1:
return '1'
if (self.xs >> bit) & 1:
return 'x'
return '0'
class EncSchemeField:
'''Represents a single field in an encoding scheme'''
def __init__(self,
bits: BitRanges,
value: Optional[BoolLiteral],
shift: int) -> None:
self.bits = bits
self.value = value
self.shift = shift
@staticmethod
def from_yaml(yml: object, what: str) -> 'EncSchemeField':
# This is either represented as a dict in the YAML or as a bare string.
bits_what = 'bits for {}'.format(what)
value_what = 'value for {}'.format(what)
shift_what = 'shift for {}'.format(what)
shift = 0
if isinstance(yml, dict):
yd = check_keys(yml, what, ['bits'], ['value', 'shift'])
bits_yml = yd['bits']
if not (isinstance(bits_yml, str) or isinstance(bits_yml, int)):
raise ValueError('{} is of type {}, not a string or int.'
.format(bits_what, type(bits_yml).__name__))
# We require value to be given as a string because it's supposed to
# be in base 2, and PyYAML will parse 111 as one-hundred and
# eleven, 011 as 9 and 0x11 as 17. Aargh!
raw_value = None
val_yml = yd.get('value')
if val_yml is not None:
if not isinstance(val_yml, str):
raise ValueError("{} is of type {}, but must be a string "
"(we don't allow automatic conversion "
"because YAML's int conversion assumes "
"base 10 and value should be in base 2)."
.format(value_what,
type(val_yml).__name__))
raw_value = val_yml
# shift, on the other hand, is written in base 10. Allow an
# integer.
shift_yml = yd.get('shift')
if shift_yml is None:
pass
elif isinstance(shift_yml, str):
if not re.match(r'[0-9]+$', shift_yml):
raise ValueError('{} is {!r} but should be a '
'non-negative integer.'
.format(shift_what, shift_yml))
shift = int(shift_yml)
elif isinstance(shift_yml, int):
if shift_yml < 0:
raise ValueError('{} is {!r} but should be a '
'non-negative integer.'
.format(shift_what, shift_yml))
shift = shift_yml
else:
raise ValueError("{} is of type {}, but must be a string "
"or non-negative integer."
.format(shift_what, type(shift_yml).__name__))
elif isinstance(yml, str) or isinstance(yml, int):
bits_yml = yml
raw_value = None
else:
raise ValueError('{} is a {}, but should be a '
'dict, string or integer.'
.format(what, type(yml).__name__))
# The bits field is usually parsed as a string ("10-4", or similar).
# But if it's a bare integer then YAML will parse it as an int. That's
# fine, but we turn it back into a string to be re-parsed by BitRanges.
assert isinstance(bits_yml, str) or isinstance(bits_yml, int)
bits = BitRanges.from_yaml(str(bits_yml), bits_what)
value = None
if raw_value is not None:
value = BoolLiteral.from_string(raw_value, value_what)
if bits.width != value.width:
raise ValueError('{} has bits that imply a width of {}, but '
'a value with width {}.'
.format(what, bits.width, value.width))
return EncSchemeField(bits, value, shift)
class EncSchemeImport:
'''An object representing inheritance of a parent scheme
When importing a parent scheme, we can set some of its fields with
immediate values. These are stored in the settings field.
'''
def __init__(self, yml: object, importer_name: str) -> None:
as_str = check_str(yml,
'value for import in encoding scheme {!r}'
.format(importer_name))
# The supported syntax is
#
# - parent0(field0=b111, field1=b10)
# - parent1()
# - parent2
match = re.match(r'([^ (]+)[ ]*(?:\(([^)]+)\))?$', as_str)
if not match:
raise ValueError('Malformed encoding scheme '
'inheritance by scheme {!r}: {!r}.'
.format(importer_name, as_str))
self.parent = match.group(1)
self.settings = {} # type: Dict[str, BoolLiteral]
when = ('When inheriting from {!r} in encoding scheme {!r}'
.format(self.parent, importer_name))
if match.group(2) is not None:
args = match.group(2).split(',')
for arg in args:
arg = arg.strip()
arg_parts = arg.split('=')
if len(arg_parts) != 2:
raise ValueError('{}, found an argument with {} '
'equals signs (should have exactly one).'
.format(when, len(arg_parts) - 1))
field_name = arg_parts[0]
field_what = ('literal value for field {!r} when inheriting '
'from {!r} in encoding scheme {!r}'
.format(arg_parts[0], self.parent, importer_name))
field_value = BoolLiteral.from_string(arg_parts[1], field_what)
if field_name in self.settings:
raise ValueError('{}, found multiple arguments assigning '
'values to the field {!r}.'
.format(when, field_name))
self.settings[field_name] = field_value
def apply_settings(self,
esf: 'EncSchemeFields', what: str) -> 'EncSchemeFields':
# Copy and set values in anything that has a setting
fields = {}
for name, literal in self.settings.items():
old_field = esf.fields.get(name)
if old_field is None:
raise ValueError('{} sets unknown field {!r} from {!r}.'
.format(what, name, self.parent))
if old_field.bits.width != literal.width:
raise ValueError('{} sets field {!r} from {!r} with a literal '
'of width {}, but the field has width {}.'
.format(what, name, self.parent,
literal.width, old_field.bits.width))
fields[name] = EncSchemeField(old_field.bits,
literal,
old_field.shift)
# Copy anything else
op_fields = set()
for name, old_field in esf.fields.items():
if name in fields:
continue
op_fields.add(name)
fields[name] = old_field
return EncSchemeFields(fields, op_fields, esf.mask)
class EncSchemeFields:
'''An object representing some fields in an encoding scheme'''
def __init__(self,
fields: Dict[str, EncSchemeField],
op_fields: Set[str],
mask: int) -> None:
self.fields = fields
self.op_fields = op_fields
self.mask = mask
@staticmethod
def empty() -> 'EncSchemeFields':
return EncSchemeFields({}, set(), 0)
@staticmethod
def from_yaml(yml: object, name: str) -> 'EncSchemeFields':
if not isinstance(yml, dict):
raise ValueError('fields for encoding scheme {!r} should be a '
'dict, but we saw a {}.'
.format(name, type(yml).__name__))
fields = {}
op_fields = set() # type: Set[str]
mask = 0
overlaps = 0
for key, val in yml.items():
if not isinstance(key, str):
raise ValueError('{!r} is a bad key for a field name of '
'encoding scheme {} (should be str, not {}).'
.format(key, name, type(key).__name__))
fld_what = 'field {!r} of encoding scheme {}'.format(key, name)
field = EncSchemeField.from_yaml(val, fld_what)
overlaps |= mask & field.bits.mask
mask |= field.bits.mask
fields[key] = field
if field.value is None:
op_fields.add(key)
if overlaps:
raise ValueError('Direct fields for encoding scheme {} have '
'overlapping ranges (mask: {:#08x})'
.format(name, overlaps))
return EncSchemeFields(fields, op_fields, mask)
def merge_in(self, right: 'EncSchemeFields', when: str) -> None:
for name, field in right.fields.items():
if name in self.fields:
raise ValueError('Duplicate field name: {!r} {}.'
.format(name, when))
overlap = self.mask & field.bits.mask
if overlap:
raise ValueError('Overlapping bit ranges '
'(masks: {:08x} and {:08x} have '
'intersection {:08x}) {}.'
.format(self.mask,
field.bits.mask, overlap, when))
self.fields[name] = field
self.mask |= field.bits.mask
if field.value is None:
assert name not in self.op_fields
self.op_fields.add(name)
class EncScheme:
def __init__(self, yml: object, name: str) -> None:
what = 'encoding scheme {!r}'.format(name)
yd = check_keys(yml, what, [], ['parents', 'fields'])
if not yd:
raise ValueError('{} has no parents or fields.'.format(what))
fields_yml = yd.get('fields')
self.direct_fields = (EncSchemeFields.from_yaml(fields_yml, name)
if fields_yml is not None
else EncSchemeFields.empty())
parents_yml = yd.get('parents')
parents_what = 'parents of {}'.format(what)
parents = ([EncSchemeImport(y, name)
for y in check_list(parents_yml, parents_what)]
if parents_yml is not None
else [])
self.parents = index_list(parents_what,
parents,
lambda imp: imp.parent)
class EncSchemes:
def __init__(self, yml: object) -> None:
if not isinstance(yml, dict):
raise ValueError("value for encoding-schemes is expected to be "
"a dict, but was actually a {}."
.format(type(yml).__name__))
self.schemes = {} # type: Dict[str, EncScheme]
self.resolved = {} # type: Dict[str, EncSchemeFields]
for key, val in yml.items():
if not isinstance(key, str):
raise ValueError('{!r} is a bad key for an encoding scheme '
'name (should be str, not {}).'
.format(key, type(key).__name__))
self.schemes[key] = EncScheme(val, key)
def _resolve(self,
name: str,
user: str,
stack: List[str]) -> EncSchemeFields:
# Have we resolved this before?
resolved = self.resolved.get(name)
if resolved is not None:
return resolved
# Spot any circular inheritance
if name in stack:
raise RuntimeError('Circular inheritance of encoding '
'schemes: {}'
.format(' -> '.join(stack + [name])))
# Does the scheme actually exist?
scheme = self.schemes.get(name)
if scheme is None:
raise ValueError('{} requires undefined encoding scheme {!r}.'
.format(user, name))
# Recursively try to resolve each parent scheme, applying any import
# settings
resolved_parents = {}
new_stack = stack + [name]
what = 'Import list of encoding scheme {!r}'.format(name)
for pname, pimport in scheme.parents.items():
resolved = self._resolve(pimport.parent, what, new_stack)
resolved_parents[pname] = pimport.apply_settings(resolved, what)
# Now try to merge the resolved imports
merged = EncSchemeFields.empty()
parent_names_so_far = [] # type: List[str]
for pname, pfields in resolved_parents.items():
when = ('merging fields of scheme {} into '
'already merged fields of {}'
.format(pname, ', '.join(parent_names_so_far)))
merged.merge_in(pfields, when)
parent_names_so_far.append(repr(pname))
# Now try to merge in any direct fields
when = ('merging direct fields of scheme {} into fields from parents'
.format(name))
merged.merge_in(scheme.direct_fields, when)
return merged
def resolve(self, name: str, mnemonic: str) -> EncSchemeFields:
fields = self._resolve(name, 'Instruction {!r}'.format(mnemonic), [])
# Check completeness
missing = ((1 << 32) - 1) & ~fields.mask
if missing:
raise ValueError('Fields for encoding scheme {} miss some bits '
'(mask: {:#08x})'
.format(name, missing))
return fields
class OperandType:
'''The base class for some sort of operand type'''
def __init__(self, width: Optional[int]) -> None:
assert width is None or width > 0
self.width = width
def markdown_doc(self) -> Optional[str]:
'''Generate any (markdown) documentation for this operand type
The base class returns None, but subclasses might return something
useful.
'''
return None
def syntax_determines_value(self) -> bool:
'''Can the value of this operand always be inferred from asm syntax?
This is true for things like registers (the value "5" only comes from
"r5", for example), but false for arbitrary immediates: an immediate
operand might have a value that comes from a relocation.
'''
return False
def read_index(self, as_str: str) -> Optional[int]:
'''Try to read the given syntax as an actual integer index
Raises a ValueError on definite failure ("found cabbage when I expected
a register name"). Returns None on a soft failure: "this is a
complicated looking expression, but it might be a sensible immediate".
'''
return None
def render_val(self, value: int) -> str:
'''Render the given value as a string.
The default implementation prints it as a decimal number. Register
operands, for example, will want to print 3 as "x3" and so on.
'''
return str(value)
class RegOperandType(OperandType):
'''A class representing a register operand type'''
TYPE_FMTS = {
'gpr': (5, 'x'),
'wdr': (5, 'w'),
'csr': (12, None),
'wsr': (8, None)
}
def __init__(self, reg_type: str, is_dest: bool):
fmt = RegOperandType.TYPE_FMTS.get(reg_type)
assert fmt is not None
width, _ = fmt
super().__init__(width)
self.reg_type = reg_type
self.is_dest = is_dest
def syntax_determines_value(self) -> bool:
return True
def read_index(self, as_str: str) -> int:
width, pfx = RegOperandType.TYPE_FMTS[self.reg_type]
re_pfx = '' if pfx is None else re.escape(pfx)
match = re.match(re_pfx + '([0-9]+)$', as_str)
if match is None:
raise ValueError("Expression {!r} can't be parsed as a {}."
.format(as_str, self.reg_type))
idx = int(match.group(1))
assert 0 <= idx
if idx >> width:
raise ValueError("Invalid register of type {}: {!r}."
.format(self.reg_type, as_str))
return idx
def render_val(self, value: int) -> str:
fmt = RegOperandType.TYPE_FMTS.get(self.reg_type)
assert fmt is not None
_, pfx = fmt
if pfx is None:
return super().render_val(value)
return '{}{}'.format(pfx, value)
class ImmOperandType(OperandType):
'''A class representing an immediate operand type'''
def markdown_doc(self) -> Optional[str]:
# Override from OperandType base class
if self.width is None:
return None
return 'Valid range: `0..{}`'.format((1 << self.width) - 1)
def read_index(self, as_str: str) -> Optional[int]:
# We only support simple integer literals.
try:
return int(as_str)
except ValueError:
return None
class EnumOperandType(ImmOperandType):
'''A class representing an enum operand type'''
def __init__(self, items: List[str]):
assert items
super().__init__(int.bit_length(len(items) - 1))
self.items = items
def markdown_doc(self) -> Optional[str]:
# Override from OperandType base class
parts = ['Syntax table:\n\n'
'| Syntax | Value of immediate |\n'
'|--------|--------------------|\n']
for idx, item in enumerate(self.items):
parts.append('| `{}` | `{}` |\n'
.format(item, idx))
return ''.join(parts)
def syntax_determines_value(self) -> bool:
return True
def read_index(self, as_str: str) -> Optional[int]:
for idx, item in enumerate(self.items):
if as_str == item:
return idx
known_vals = ', '.join(repr(item) for item in self.items)
raise ValueError('Invalid enum value, {!r}. '
'Supported values: {}.'
.format(as_str, known_vals))
def render_val(self, value: int) -> str:
# On a bad value, we have to return *something*. Since this is just
# going into disassembly, let's be vaguely helpful and return something
# that looks clearly bogus.
#
# Note that if the number of items in the enum is not a power of 2,
# this could happen with a bad binary, despite good tools.
if value < 0 or value >= len(self.items):
return '???'
return self.items[value]
class OptionOperandType(ImmOperandType):
'''A class representing an option operand type'''
def __init__(self, option: str):
super().__init__(1)
self.option = option
def markdown_doc(self) -> Optional[str]:
# Override from OperandType base class
return 'To specify, use the literal syntax `{}`\n'.format(self.option)
def syntax_determines_value(self) -> bool:
return True
def read_index(self, as_str: str) -> Optional[int]:
if as_str == self.option:
return 1
raise ValueError('Invalid option value, {!r}. '
'If specified, it should have been {!r}.'
.format(as_str, self.option))
def render_val(self, value: int) -> str:
# Option types are always 1 bit wide, so the value should be 0 or 1.
assert value in [0, 1]
return self.option if value else ''
def parse_operand_type(fmt: str) -> OperandType:
'''Make sense of the operand type syntax'''
# Registers
if fmt == 'grs':
return RegOperandType('gpr', False)
if fmt == 'grd':
return RegOperandType('gpr', True)
if fmt == 'wrs':
return RegOperandType('wdr', False)
if fmt == 'wrd':
return RegOperandType('wdr', True)
if fmt == 'csr':
return RegOperandType('csr', True)
if fmt == 'wsr':
return RegOperandType('wsr', True)
# Immediates
if fmt == 'imm':
return ImmOperandType(None)
m = re.match(r'imm([1-9][0-9]*)$', fmt)
if m:
return ImmOperandType(int(m.group(1)))
m = re.match(r'enum\(([^\)]+)\)$', fmt)
if m:
return EnumOperandType([item.strip()
for item in m.group(1).split(',')])
m = re.match(r'option\(([^\)]+)\)$', fmt)
if m:
return OptionOperandType(m.group(1).strip())
raise ValueError("Operand type description {!r} "
"didn't match any recognised format."
.format(fmt))
def infer_operand_type(name: str) -> OperandType:
'''Try to guess an operand's type from its name'''
if re.match(r'grs[0-9]*$', name):
return parse_operand_type('grs')
if name in ['grd', 'wrd', 'csr', 'wsr']:
return parse_operand_type(name)
if re.match(r'wrs[0-9]*$', name):
return parse_operand_type('wrs')
if re.match(r'imm[0-9]*$', name):
return parse_operand_type('imm')
if name == 'offset':
return parse_operand_type('imm')
raise ValueError("Operand name {!r} doesn't imply an operand type: "
"you'll have to set the type explicitly."
.format(name))
def make_operand_type(yml: object, operand_name: str) -> OperandType:
'''Construct a type for an operand
This is either based on the type, if given, or inferred from the name
otherwise.
'''
return (parse_operand_type(check_str(yml,
'type for {} operand'
.format(operand_name)))
if yml is not None
else infer_operand_type(operand_name))
def get_optional_str(data: Dict[str, object],
key: str, what: str) -> Optional[str]:
return check_optional_str(data.get(key), '{} field for {}'.format(key, what))
class Operand:
def __init__(self, yml: object, insn_name: str) -> None:
# The YAML representation should be a string (a bare operand name) or a
# dict.
what = 'operand for {!r} instruction'.format(insn_name)
if isinstance(yml, str):
name = yml
op_type = None
doc = None
elif isinstance(yml, dict):
yd = check_keys(yml, what, ['name'], ['type', 'doc'])
name = check_str(yd['name'], 'name of ' + what)
op_what = '{!r} {}'.format(name, what)
op_type = get_optional_str(yd, 'type', op_what)
doc = get_optional_str(yd, 'doc', op_what)
op_what = '{!r} {}'.format(name, what)
self.name = name
self.op_type = make_operand_type(op_type, name)
self.doc = doc
class SyntaxToken:
'''An object representing a single token in an instruction's syntax
See InsnSyntax for more details. The is_literal attribute is true if this
is a literal hunk of text (rather than an operand name). The text attribute
either holds the literal syntax or the operand name.
'''
def __init__(self, is_literal: bool, text: str) -> None:
assert text
self.is_literal = is_literal
# Make whitespace canonical for literals
self.text = re.sub(r'\s+', ' ', text) if is_literal else text
def render_doc(self) -> str:
'''Return how this syntax token should look in the documentation'''
if self.is_literal:
return self.text
else:
return '<{}>'.format(self.text)
def asm_pattern(self) -> str:
'''Return a regex pattern that can be used for matching this token
If the token represents an operand, the pattern is wrapped in a group
(to capture the operand). For more details about the syntax, see
InsnSyntax.
'''
if self.is_literal:
# A literal that is pure whitespace "requires the whitespace".
# Otherwise, replace all internal whitespace with \s+ and allow
# optional whitespace afterwards. To do this easily, we split the
# literal on whitespace. The result is empty iff it was just
# whitespace in the first place.
words = self.text.split()
if not words:
return r'\s+'
# For non-whitespace literals, we disallow leading space and add
# optional trailing space. This convention should avoid lots of
# \s*\s* pairs.
parts = [re.escape(words[0])]
for w in words[1:]:
parts.append(r'\s+')
parts.append(re.escape(w))
parts.append(r'\s*')
return ''.join(parts)
# Otherwise, this is an operand. For now, at least, we're very
# restrictive for operands. No spaces and no commas (the second rule
# avoids silliness like "a, b, c" matching a syntax with only two
# operands by setting the second to "b, c").
#
# We also split out ++ and -- separately, to disambiguate things like
# x1++, which must be parsed as x1 followed by ++.
#
# If we want to do better and allow things like
#
# addi x0, x1, 1 + 3
#
# then we need to use something more serious than just regexes for
# parsing.
return r'([^ ,+\-]+|[+\-]+)\s*'
def render_vals(self,
op_vals: Dict[str, int],
operands: Dict[str, Operand]) -> str:
'''Return an assembly listing for the given operand fields
'''
if self.is_literal:
return self.text
assert self.text in op_vals
assert self.text in operands
return operands[self.text].op_type.render_val(op_vals[self.text])
class SyntaxHunk:
'''An object representing a hunk of syntax that might be optional'''
def __init__(self,
is_optional: bool,
tokens: List[SyntaxToken],
op_list: List[str],
op_set: Set[str]) -> None:
assert tokens
self.is_optional = is_optional
self.tokens = tokens
self.op_list = op_list
self.op_set = op_set
@staticmethod
def from_list(operands: List[str]) -> 'SyntaxHunk':
'''Smart constructor for a list of operands with "normal" syntax'''
assert operands
comma = SyntaxToken(True, ', ')
tokens = [SyntaxToken(False, operands[0])]
for op in operands[1:]:
tokens.append(comma)
tokens.append(SyntaxToken(False, op))
op_set = set(operands)
assert len(op_set) == len(operands)
return SyntaxHunk(False, tokens, operands, op_set)
@staticmethod
def from_string(mnemonic: str, optional: bool, raw: str) -> 'SyntaxHunk':
'''Smart constructor that parses YAML syntax (see InsnSyntax)'''
assert raw
tokens = []
op_list = []
op_set = set()
parts = re.split(r'<([^>]+)>', raw)
for idx, part in enumerate(parts):
# The matches for the regex appear in positions 1, 3, 5, ...
is_literal = not (idx & 1)
if ('<' in part or '>' in part) and not is_literal:
raise ValueError("Syntax for {!r} has hunk {!r} which doesn't "
"seem to surround <operand>s properly."
.format(mnemonic, raw))
if not is_literal:
assert part
if part in op_set:
raise ValueError("Syntax for {!r} has hunk {!r} with "
"more than one occurrence of <{}>."
.format(mnemonic, raw, part))
op_list.append(part)
op_set.add(part)
# Only allow empty parts (and skip their tokens) if at one end or
# the other
if not part and idx not in [0, len(parts) - 1]:
raise ValueError("Syntax for {!r} has two adjacent operand "
"tokens, with no intervening syntax."
.format(mnemonic))
if part:
tokens.append(SyntaxToken(is_literal, part))
return SyntaxHunk(optional, tokens, op_list, op_set)
def render_doc(self) -> str:
'''Return how this hunk should look in the documentation'''
parts = []
for token in self.tokens:
parts.append(token.render_doc())
body = ''.join(parts)
return '[{}]'.format(body) if self.is_optional else body
def asm_pattern(self) -> str:
'''Return a regex pattern that can be used for matching this hunk
The result will have a group per operand. It allows trailing, but not
leading, space within the hunk.
'''
parts = []
for token in self.tokens:
parts.append(token.asm_pattern())
body = ''.join(parts)
# For an optional hunk, we build it up in the form "(?:foo)?". This
# puts a non-capturing group around foo and then applies "?"
# (one-or-more) to it.
return '(?:{})?'.format(body) if self.is_optional else body
def render_vals(self,
op_vals: Dict[str, int],
operands: Dict[str, Operand]) -> str:
'''Return an assembly listing for the hunk given operand values
If this hunk is optional and all its operands are zero, the hunk is
omitted (so this function returns the empty string).
'''
if self.is_optional:
required = False
for op_name in self.op_list:
if op_vals[op_name] != 0:
required = True
break
if not required:
return ''
return ''.join(token.render_vals(op_vals, operands)
for token in self.tokens)
class InsnSyntax:
'''A class representing the syntax of an instruction
An instruction's syntax is specified in the YAML file by writing it out
with operand names surrounded by angle brackets. For example, a simple NOT
instruction might have a syntax of
<dst>, <src>
which should be interpreted as the following tokens:
- Operand called 'dst'
- A literal ','
- Operand called 'src'
Between the tokens, whitespace is optional (so "x0 , x1" and "x0,x1" both
match the syntax above) unless a literal token is just a space, in which
case some whitespace is required. For example
<dst> <src>
would match "x0 x1" but not "x0x1". Whitespace within literal syntax tokens
means that some space is required, matching the regex \\s+. For example,
the (rather strange) syntax
<dst> + - <src>
would match "x0 + - x1" or "x0+ -x1", but not "x0 +- x1".
Some operands (and surrounding syntax) might be optional. The optional
syntax is surrounded by square brackets. Nesting is not supported. For
example:
<dst>, <src>[, <offset>]
would match "x0, x1, 123" or "x0, x1".
Note that a given syntax might be ambiguous. For example,
<dst>, <src>[, <offset>][, <flavour>]
With "x0, x1, 123", is 123 an offset or a flavour? (We choose not to embed
typing information into the syntax, because that results in very confusing
assembler error messages). We break ties in the same way as the underlying
regex engine, assigning the operand to the first group, so 123 is an offset
in this case. Such syntaxes are rather confusing though, so probably not a
good idea.
The parsed syntax is stored as a list of "hunks". Each hunk contains a flag
showing whether the hunk is optional or required and also a list of
SyntaxToken objects.
'''
def __init__(self,
hunks: List[SyntaxHunk],
op_list: List[str],
op_set: Set[str]) -> None:
self.hunks = hunks
self.op_list = op_list
self.op_set = op_set
@staticmethod
def from_list(operands: List[str]) -> 'InsnSyntax':
'''Smart constructor for a list of operands with "normal" syntax'''
if not operands:
return InsnSyntax([], [], set())
hunk = SyntaxHunk.from_list(operands)
return InsnSyntax([hunk], hunk.op_list, hunk.op_set)
@staticmethod
def from_yaml(mnemonic: str, raw: str) -> 'InsnSyntax':
'''Parse the syntax in the YAML file'''
# The raw syntax looks something like
#
# <op0>, <op1>[(<op2>)]
#
# to mean that you either have "x0, x1" or "x0, x2(x3)". First, split
# out the bracketed parts.
by_left = raw.split('[')
parts = [(False, by_left[0])]
for after_left in by_left[1:]:
split = after_left.split(']', 1)
if len(split) != 2:
raise ValueError('Unbalanced or nested [] in instruction '
'syntax for {!r}.'
.format(mnemonic))
parts += [(True, split[0]), (False, split[1])]
# Now parts contains a list of pairs (required, txt) where txt is a
# hunk of the syntax and req is true if this hunk is required. A part
# might be empty. For example, "[a]b c[d]" with both lead and trail
# with an empty part. But it shouldn't be empty if it's marked
# optional: that would be something like "a[]b", which doesn't make
# much sense.
hunks = []
for optional, raw in parts:
if raw:
hunks.append(SyntaxHunk.from_string(mnemonic, optional, raw))
elif optional:
raise ValueError('Empty [] in instruction syntax for {!r}.'
.format(mnemonic))
# Collect up operands across the hunks
op_list = []
op_set = set()
for hunk in hunks:
op_list += hunk.op_list
op_set |= hunk.op_set
if len(op_list) != len(op_set):
raise ValueError('Instruction syntax for {!r} is not '
'linear in its operands.'
.format(mnemonic))
return InsnSyntax(hunks, op_list, op_set)
def render_doc(self) -> str:
'''Return how this syntax should look in the documentation'''
return ''.join(hunk.render_doc() for hunk in self.hunks)
def asm_pattern(self) -> Tuple[str, Dict[str, int]]:
'''Return a regex pattern and a group name map for this syntax'''
parts = [r'\s*']
for hunk in self.hunks:
parts.append(hunk.asm_pattern())
parts.append('$')
pattern = ''.join(parts)
op_to_grp = {}
for idx, op in enumerate(self.op_list):
op_to_grp[op] = 1 + idx
return (pattern, op_to_grp)
def render_vals(self,
op_vals: Dict[str, int],
operands: Dict[str, Operand]) -> str:
'''Return an assembly listing for the given operand fields'''
parts = []
for hunk in self.hunks:
parts.append(hunk.render_vals(op_vals, operands))
return ''.join(parts)
class EncodingField:
'''A single element of an encoding's mapping'''
def __init__(self,
value: Union[BoolLiteral, str],
scheme_field: EncSchemeField) -> None:
self.value = value
self.scheme_field = scheme_field
@staticmethod
def from_yaml(as_str: str,
scheme_field: EncSchemeField,
name_to_operand: Dict[str, Operand],
what: str) -> 'EncodingField':
# The value should either be a boolean literal ("000xx11" or similar)
# or should be a name, which is taken as the name of an operand.
if not as_str:
raise ValueError('Empty string as {}.'.format(what))
# Set self.value to be either the bool literal or the name of the
# operand.
value_width = None
value = '' # type: Union[BoolLiteral, str]
if re.match(r'b[01x_]+$', as_str):
value = BoolLiteral.from_string(as_str, what)
value_width = value.width
value_type = 'a literal value'
else:
operand = name_to_operand.get(as_str)
if operand is None:
raise ValueError('Unknown operand, {!r}, as {}'
.format(as_str, what))
value_width = operand.op_type.width
value = as_str
value_type = 'an operand'
# Unless we had an operand of type 'imm' (unknown width), we now have
# an expected width. Check it matches the width of the schema field.
if value_width is not None:
if scheme_field.bits.width != value_width:
raise ValueError('{} is mapped to {} with width {}, but the '
'encoding schema field has width {}.'
.format(what, value_type, value_width,
scheme_field.bits.width))
# Track the scheme field as well (so we don't have to keep track of a
# scheme once we've made an encoding object)
return EncodingField(value, scheme_field)
class Encoding:
'''The encoding for an instruction'''
def __init__(self,
yml: object,
schemes: EncSchemes,
name_to_operand: Dict[str, Operand],
mnemonic: str):
what = 'encoding for instruction {!r}'.format(mnemonic)
yd = check_keys(yml, what, ['scheme', 'mapping'], [])
scheme_what = 'encoding scheme for instruction {!r}'.format(mnemonic)
scheme_name = check_str(yd['scheme'], scheme_what)
scheme_fields = schemes.resolve(scheme_name, mnemonic)
what = 'encoding mapping for instruction {!r}'.format(mnemonic)
# Check we've got exactly the right fields for the scheme
ydm = check_keys(yd['mapping'], what, list(scheme_fields.op_fields), [])
# Track the set of operand names that were used in some field
operands_used = set()
self.fields = {}
for field_name, scheme_field in scheme_fields.fields.items():
if scheme_field.value is not None:
field = EncodingField(scheme_field.value, scheme_field)
else:
field_what = ('value for {} field in encoding for instruction {!r}'
.format(field_name, mnemonic))
field = EncodingField.from_yaml(check_str(ydm[field_name], field_what),
scheme_fields.fields[field_name],
name_to_operand,
field_what)
# If the field's value is an operand rather than a literal, it
# will have type str. Track the operands that we've used.
if isinstance(field.value, str):
operands_used.add(field.value)
self.fields[field_name] = field
# We know that every field in the encoding scheme has a value. But we
# still need to check that every operand ended up in some field.
assert operands_used <= set(name_to_operand.keys())
unused_ops = set(name_to_operand.keys()) - operands_used
if unused_ops:
raise ValueError('Not all operands used in {} (missing: {}).'
.format(what, ', '.join(list(unused_ops))))
def get_masks(self) -> Tuple[int, int]:
'''Return zeros/ones masks for encoding
Returns a pair (m0, m1) where m0 is the "zeros mask": a mask where a
bit is set if there is an bit pattern matching this encoding with that
bit zero. m1 is the ones mask: equivalent, but for that bit one.
'''
m0 = 0
m1 = 0
for field_name, field in self.fields.items():
if isinstance(field.value, str):
m0 |= field.scheme_field.bits.mask
m1 |= field.scheme_field.bits.mask
else:
# Match up the bits in the value with the ranges in the scheme.
assert field.value.width > 0
assert field.value.width == field.scheme_field.bits.width
bits_seen = 0
for msb, lsb in field.scheme_field.bits.ranges:
val_msb = field.scheme_field.bits.width - 1 - bits_seen
val_lsb = val_msb - msb + lsb
bits_seen += msb - lsb + 1
for idx in range(0, msb - lsb + 1):
desc = field.value.char_for_bit(val_lsb + idx)
if desc in ['0', 'x']:
m0 |= 1 << (idx + lsb)
if desc in ['1', 'x']:
m1 |= 1 << (idx + lsb)
all_bits = (1 << 32) - 1
assert (m0 | m1) == all_bits
return (m0, m1)
def get_ones_mask(self) -> int:
'''Return the mask of fixed bits that are set
For literal values of x (unused bits in the encoding), we'll prefer
'0'.
'''
m0, m1 = self.get_masks()
return m1 & ~m0
def assemble(self, op_to_idx: Dict[str, int]) -> int:
'''Assemble an instruction
op_to_idx should map each operand in the encoding to some integer
index, which should be small enough to fit in the width of the
operand's type and should be representable after any shift. Will raise
a ValueError if not.
'''
val = self.get_ones_mask()
for field_name, field in self.fields.items():
if not isinstance(field.value, str):
# We've done this field already (in get_ones_mask)
continue
# Try to get the operand value for the field. If this is an
# optional operand, we might not have one, and just encode zero.
field_val = op_to_idx.get(field.value, 0)
# Are there any low bits that shouldn't be there?
shift_mask = (1 << field.scheme_field.shift) - 1
if field_val & shift_mask:
raise ValueError("operand field {} has a shift of {}, "
"so can't represent the value {:#x}."
.format(field.value,
field.scheme_field.shift,
field_val))
shifted = field_val >> field.scheme_field.shift
# Is the number too big? At the moment, we are assuming immediates
# are unsigned (because the OTBN big number instructions all have
# unsigned immediates).
if shifted >> field.scheme_field.bits.width:
shift_msg = ((' (shifted right by {} bits from {:#x})'
.format(field.scheme_field.shift, field_val))
if field.scheme_field.shift
else '')
raise ValueError("operand field {} has a width of {}, "
"so can't represent the value {:#x}{}."
.format(field.value,
field.scheme_field.bits.width,
shifted, shift_msg))
val |= field.scheme_field.bits.encode(shifted)
return val
class Insn:
def __init__(self,
yml: object,
groups: InsnGroups,
encoding_schemes: EncSchemes) -> None:
yd = check_keys(yml, 'instruction',
['mnemonic', 'operands'],
['group', 'rv32i', 'synopsis',
'syntax', 'doc', 'note', 'trailing-doc',
'decode', 'operation', 'encoding', 'glued-ops'])
self.mnemonic = check_str(yd['mnemonic'], 'mnemonic for instruction')
what = 'instruction with mnemonic {!r}'.format(self.mnemonic)
self.operands = [Operand(y, self.mnemonic)
for y in check_list(yd['operands'],
'operands for ' + what)]
self.name_to_operand = index_list('operands for ' + what,
self.operands,
lambda op: op.name)
raw_group = get_optional_str(yd, 'group', what)
self.group = groups.default_group() if raw_group is None else raw_group
if self.group not in groups.key_to_group:
raise ValueError('Unknown instruction group, {!r}, '
'for mnemonic {!r}.'
.format(self.group, self.mnemonic))
self.rv32i = check_bool(yd.get('rv32i', False),
'rv32i flag for ' + what)
self.glued_ops = check_bool(yd.get('glued-ops', False),
'glued-ops flag for ' + what)
self.synopsis = get_optional_str(yd, 'synopsis', what)
self.doc = get_optional_str(yd, 'doc', what)
self.note = get_optional_str(yd, 'note', what)
self.trailing_doc = get_optional_str(yd, 'trailing-doc', what)
self.decode = get_optional_str(yd, 'decode', what)
self.operation = get_optional_str(yd, 'operation', what)
raw_syntax = get_optional_str(yd, 'syntax', what)
if raw_syntax is not None:
self.syntax = InsnSyntax.from_yaml(self.mnemonic,
raw_syntax.strip())
else:
self.syntax = InsnSyntax.from_list([op.name
for op in self.operands])
pattern, op_to_grp = self.syntax.asm_pattern()
self.asm_pattern = re.compile(pattern)
self.pattern_op_to_grp = op_to_grp
# Make sure we have exactly the operands we expect.
if set(self.name_to_operand.keys()) != self.syntax.op_set:
raise ValueError("Operand syntax for {!r} doesn't have the "
"same list of operands as given in the "
"operand list. The syntax uses {}, "
"but the list of operands gives {}."
.format(self.mnemonic,
list(sorted(self.syntax.op_set)),
list(sorted(self.name_to_operand))))
encoding_yml = yd.get('encoding')
self.encoding = None
if encoding_yml is not None:
self.encoding = Encoding(encoding_yml, encoding_schemes,
self.name_to_operand, self.mnemonic)
def find_ambiguous_encodings(insns: List[Insn]) -> List[Tuple[str, str, int]]:
'''Check for ambiguous instruction encodings
Returns a list of ambiguous pairs (mnemonic0, mnemonic1, bits) where
bits is a bit pattern that would match either instruction.
'''
masks = {}
for insn in insns:
if insn.encoding is not None:
masks[insn.mnemonic] = insn.encoding.get_masks()
ret = []
for mnem0, mnem1 in itertools.combinations(masks.keys(), 2):
m00, m01 = masks[mnem0]
m10, m11 = masks[mnem1]
# The pair of instructions is ambiguous if a bit pattern might be
# either instruction. That happens if each bit index is either
# allowed to be a 0 in both or allowed to be a 1 in both.
# ambiguous_mask is the set of bits that don't distinguish the
# instructions from each other.
m0 = m00 & m10
m1 = m01 & m11
ambiguous_mask = m0 | m1
if ambiguous_mask == (1 << 32) - 1:
ret.append((mnem0, mnem1, m1 & ~m0))
return ret
class InsnsFile:
def __init__(self, yml: object) -> None:
yd = check_keys(yml, 'top-level',
['insn-groups', 'encoding-schemes', 'insns'],
[])
self.groups = InsnGroups(yd['insn-groups'])
self.encoding_schemes = EncSchemes(yd['encoding-schemes'])
self.insns = [Insn(i, self.groups, self.encoding_schemes)
for i in check_list(yd['insns'], 'insns')]
self.mnemonic_to_insn = index_list('insns', self.insns,
lambda insn: insn.mnemonic.lower())
ambiguous_encodings = find_ambiguous_encodings(self.insns)
if ambiguous_encodings:
ambiguity_msgs = []
for mnem0, mnem1, bits in ambiguous_encodings:
ambiguity_msgs.append('{!r} and {!r} '
'both match bit pattern {:#010x}'
.format(mnem0, mnem1, bits))
raise ValueError('Ambiguous instruction encodings: ' +
', '.join(ambiguity_msgs))
def grouped_insns(self) -> List[Tuple[InsnGroup, List[Insn]]]:
'''Return the instructions in groups'''
grp_to_insns = {} # type: Dict[str, List[Insn]]
for insn in self.insns:
grp_to_insns.setdefault(insn.group, []).append(insn)
ret = []
for grp in self.groups.groups:
ret.append((grp, grp_to_insns.get(grp.key, [])))
# We should have picked up all the instructions, because we checked
# that each instruction has a valid group in the Insn constructor. Just
# in case something went wrong, check that the counts match.
gti_count = sum(len(insns) for insns in grp_to_insns.values())
ret_count = sum(len(insns) for _, insns in ret)
assert ret_count == gti_count
return ret
def load_file(path: str) -> InsnsFile:
'''Load the YAML file at path.
Raises a RuntimeError on syntax or schema error.
'''
try:
with open(path, 'r') as handle:
return InsnsFile(yaml.load(handle, Loader=yaml.SafeLoader))
except FileNotFoundError:
raise RuntimeError('Cannot find YAML file at {!r}.'
.format(path)) from None
except yaml.YAMLError as err:
raise RuntimeError('Failed to parse YAML file at {!r}: {}'
.format(path, err)) from None
except ValueError as err:
raise RuntimeError('Invalid schema in YAML file at {!r}: {}'
.format(path, err)) from None