| // Copyright 2020 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #ifndef IREE_VM_BYTECODE_DISPATCH_UTIL_H_ |
| #define IREE_VM_BYTECODE_DISPATCH_UTIL_H_ |
| |
| #include <assert.h> |
| #include <string.h> |
| |
| #include "iree/base/api.h" |
| #include "iree/vm/bytecode/module_impl.h" |
| #include "iree/vm/bytecode/utils/isa.h" |
| |
| //===----------------------------------------------------------------------===// |
| // Shared data structures |
| //===----------------------------------------------------------------------===// |
| // |
| // Register bounds checking |
| // ------------------------ |
| // All accesses into the register lists are truncated to the valid range for the |
| // typed bank. This allows us to directly use the register ordinals from the |
| // bytecode without needing to perform any validation at load-time or run-time. |
| // The worst that can happen is that the bytecode program being executed doesn't |
| // work as intended - which, with a working compiler, shouldn't happen. Though |
| // there are cases where the runtime produces the register values and may know |
| // that they are in range it's a good habit to always mask the ordinal by the |
| // type-specific mask so that it's not possible for out of bounds accesses to |
| // sneak in. The iree_vm_registers_t struct is often kept in cache and the |
| // masking is cheap relative to any other validation we could be performing. |
| // |
| // Alternative register widths |
| // --------------------------- |
| // Registers in the VM are just a blob of memory and not physical device |
| // registers. They have a natural width of 32-bits as that covers a majority of |
| // our usage for i32/f32 but can be accessed at larger widths such as 64-bits or |
| // more for vector operations. The base of each frame's register memory is |
| // 16-byte aligned and accessing any individual register as a 32-bit value is |
| // always 4-byte aligned. |
| // |
| // Supporting other register widths is "free" in that the registers for all |
| // widths alias the same register storage memory. This is similar to how |
| // physical registers work in x86 where each register can be accessed at |
| // different sizes (like EAX/RAX alias and the SIMD registers alias as XMM1 is |
| // 128-bit, YMM1 is 256-bit, and ZMM1 is 512-bit but all the same storage). |
| // |
| // The requirement for doing this is that the base alignment for any register |
| // must be a multiple of 4 (due to the native 32-bit storage) AND aligned to the |
| // natural size of the register (so 8 bytes for i64, 16 bytes for v128, etc). |
| // This alignment can easily be done by masking off the low bits such that we |
| // know for any valid `reg` ordinal aligned to 4 bytes `reg/N` will still be |
| // within register storage. For example, i64 registers are accessed as `reg&~1` |
| // to align to 8 bytes starting at byte 0 of the register storage. |
| // |
| // Transferring between register types can be done with vm.ext.* and vm.trunc.* |
| // ops. For example, vm.trunc.i64.i32 will read an 8 byte register and write |
| // two 4 byte registers (effectively) with hi=0 and lo=the lower 32-bits of the |
| // value. |
| |
| // Pointers to typed register storage. |
| typedef struct iree_vm_registers_t { |
| // 16-byte aligned i32 register array. |
| int32_t* i32; |
| // Naturally aligned ref register array. |
| iree_vm_ref_t* ref; |
| } iree_vm_registers_t; |
| |
| // Storage associated with each stack frame of a bytecode function. |
| // NOTE: we cannot store pointers to the stack in here as the stack may be |
| // reallocated. |
| typedef struct iree_vm_bytecode_frame_storage_t { |
| // Calling convention results fragment. |
| iree_string_view_t cconv_results; |
| |
| // Pointer to a register list within the stack frame where return registers |
| // will be stored by callees upon return. |
| const iree_vm_register_list_t* return_registers; |
| |
| // Counts of each register type and their relative byte offsets from the head |
| // of this struct. |
| uint32_t i32_register_count; |
| uint32_t i32_register_offset; |
| uint32_t ref_register_count; |
| uint32_t ref_register_offset; |
| } iree_vm_bytecode_frame_storage_t; |
| |
| // Maps a type ID to a type def with clamping for out of bounds values. |
| static inline iree_vm_type_def_t iree_vm_map_type( |
| iree_vm_bytecode_module_t* module, int32_t type_id) { |
| type_id = type_id >= module->type_count ? 0 : type_id; |
| return module->type_table[type_id]; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Debugging utilities |
| //===----------------------------------------------------------------------===// |
| |
// Evaluates to a truth value indicating whether per-instruction tracing
// should be performed.
//
// When tracing is force-enabled at compile time this is the constant `true`.
// Otherwise it tests the IREE_VM_INVOCATION_FLAG_TRACE_EXECUTION bit of the
// invocation flags of the `stack` variable that must be in scope at the
// expansion site, yielding 1 when the bit is set and 0 when it is clear.
#if IREE_VM_EXECUTION_TRACING_FORCE_ENABLE
#define IREE_IS_DISPATCH_TRACING_ENABLED() true
#else
#define IREE_IS_DISPATCH_TRACING_ENABLED()    \
  ((iree_vm_stack_invocation_flags(stack) &   \
    IREE_VM_INVOCATION_FLAG_TRACE_EXECUTION) != 0)
#endif  // IREE_VM_EXECUTION_TRACING_FORCE_ENABLE
| |
// Traces the instruction currently being dispatched.
//
// When execution tracing is compiled in and IREE_IS_DISPATCH_TRACING_ENABLED()
// is true, this calls iree_vm_bytecode_trace_disassembly for the instruction
// located |pc_offset| bytes before the current `pc`, writing to stderr. Any
// error from that call is propagated by IREE_RETURN_IF_ERROR, returning from
// the enclosing function. |op_name| is accepted for symmetry with the
// dispatch macros but is not used by the expansion.
//
// Requires `current_frame`, `pc`, `regs`, and `stack` (the latter via
// IREE_IS_DISPATCH_TRACING_ENABLED) to be in scope at the expansion site.
// The address of `regs` is passed, so `regs` must be the register struct
// itself and not a pointer to it.
//
// When execution tracing is compiled out the macro expands to nothing.
#if IREE_VM_EXECUTION_TRACING_ENABLE
#define IREE_DISPATCH_TRACE_INSTRUCTION(pc_offset, op_name)    \
  if (IREE_IS_DISPATCH_TRACING_ENABLED()) {                    \
    IREE_RETURN_IF_ERROR(iree_vm_bytecode_trace_disassembly(   \
        current_frame, (pc - (pc_offset)), &regs, stderr));    \
  }

#else
#define IREE_DISPATCH_TRACE_INSTRUCTION(...)
#endif  // IREE_VM_EXECUTION_TRACING_ENABLE
| |
// Selects the interpreter dispatch strategy. Exactly one of
// IREE_DISPATCH_MODE_SWITCH or IREE_DISPATCH_MODE_COMPUTED_GOTO is defined:
// computed goto is chosen only when compiling with clang AND
// IREE_VM_BYTECODE_DISPATCH_COMPUTED_GOTO_ENABLE is nonzero; every other
// configuration falls back to switch-based dispatch.
#if !defined(IREE_COMPILER_CLANG) || \
    !IREE_VM_BYTECODE_DISPATCH_COMPUTED_GOTO_ENABLE
#define IREE_DISPATCH_MODE_SWITCH 1
#else
#define IREE_DISPATCH_MODE_COMPUTED_GOTO 1
#endif  // IREE_VM_BYTECODE_DISPATCH_COMPUTED_GOTO_ENABLE
| |
| //===----------------------------------------------------------------------===// |
| // Utilities matching the tablegen op encoding scheme |
| //===----------------------------------------------------------------------===// |
| // These utilities match the VM_Enc* statements in VMBase.td 1:1, allowing us |
| // to have the inverse of the encoding which makes things easier to read. |
| // |
| // Each macro will increment the pc by the number of bytes read and as such must |
| // be called in the same order the values are encoded. |
| |
// Immediate/attribute decoders.
//
// Each macro expands to `<value expression>; <pc advance>;` and is meant to be
// used as the initializer of a declaration or the right-hand side of an
// assignment, e.g. `int32_t v = VM_DecConstI32("v");`. The value is read at
// the current `pc` through the OP_* accessors (which must be defined at the
// expansion site) and `pc` is then advanced by the encoded size. Because the
// expansion is two statements it must not be used as the sole body of an
// unbraced if/for/while.
//
// The |name| argument is documentation only and is not used by any expansion.

// 1-byte immediate.
#define VM_DecConstI8(name) \
  OP_I8(0);                 \
  ++pc;
// 4-byte integer immediate.
#define VM_DecConstI32(name) \
  OP_I32(0);                 \
  pc += 4;
// 8-byte integer immediate.
#define VM_DecConstI64(name) \
  OP_I64(0);                 \
  pc += 8;
// 4-byte floating-point immediate.
#define VM_DecConstF32(name) \
  OP_F32(0);                 \
  pc += 4;
// 8-byte floating-point immediate.
#define VM_DecConstF64(name) \
  OP_F64(0);                 \
  pc += 8;
// Function/global/rodata attributes are encoded as 4-byte immediates.
#define VM_DecFuncAttr(name) VM_DecConstI32(name)
#define VM_DecGlobalAttr(name) VM_DecConstI32(name)
#define VM_DecRodataAttr(name) VM_DecConstI32(name)
// 4-byte type ID mapped through iree_vm_map_type; requires `module` in scope.
#define VM_DecType(name)               \
  iree_vm_map_type(module, OP_I32(0)); \
  pc += 4;
#define VM_DecTypeOf(name) VM_DecType(name)
// Integer/float attributes alias the immediate decoders of matching width.
#define VM_DecIntAttr32(name) VM_DecConstI32(name)
#define VM_DecIntAttr64(name) VM_DecConstI64(name)
#define VM_DecFloatAttr32(name) VM_DecConstF32(name)
#define VM_DecFloatAttr64(name) VM_DecConstF64(name)
// Length-prefixed string: a 2-byte length followed by that many bytes.
// Fills |out_str|->size and points |out_str|->data directly at the bytes
// inside `bytecode_data` (no copy), then advances `pc` past both the length
// prefix and the string bytes. Unlike the other decoders this is a statement
// sequence, not an initializer expression.
#define VM_DecStrAttr(name, out_str)                     \
  (out_str)->size = (iree_host_size_t)OP_I16(0);         \
  (out_str)->data = (const char*)&bytecode_data[pc + 2]; \
  pc += 2 + (out_str)->size;
// Branch targets are encoded as 4-byte immediates.
#define VM_DecBranchTarget(block_name) VM_DecConstI32(block_name)
| #define VM_DecBranchOperands(operands_name) \ |
| VM_DecBranchOperandsImpl(bytecode_data, &pc) |
| static inline const iree_vm_register_remap_list_t* VM_DecBranchOperandsImpl( |
| const uint8_t* IREE_RESTRICT bytecode_data, iree_vm_source_offset_t* pc) { |
| VM_AlignPC(*pc, IREE_REGISTER_ORDINAL_SIZE); |
| const iree_vm_register_remap_list_t* list = |
| (const iree_vm_register_remap_list_t*)&bytecode_data[*pc]; |
| *pc = *pc + IREE_REGISTER_ORDINAL_SIZE + |
| list->size * 2 * IREE_REGISTER_ORDINAL_SIZE; |
| return list; |
| } |
// Operand register decoders.
//
// Each macro reads a 16-bit register ordinal at the current `pc` via
// OP_I16(0), yields the operand from the register bank, and advances `pc` by
// IREE_REGISTER_ORDINAL_SIZE. Like the immediate decoders they expand to
// `<value expression>; <pc advance>;` and are intended for use as an
// initializer: `int32_t lhs = VM_DecOperandRegI32("lhs");`.
//
// Requires `regs_i32` / `regs_ref`, `pc`, and OP_I16 at the expansion site.
// The |name| argument is documentation only.

// Value of the i32 register.
#define VM_DecOperandRegI32(name) \
  regs_i32[OP_I16(0)];            \
  pc += IREE_REGISTER_ORDINAL_SIZE;
// Value of the i64 register aliasing the i32 storage at the ordinal.
#define VM_DecOperandRegI64(name)    \
  *((int64_t*)&regs_i32[OP_I16(0)]); \
  pc += IREE_REGISTER_ORDINAL_SIZE;
// i64 register value converted to iree_host_size_t.
#define VM_DecOperandRegI64HostSize(name) \
  (iree_host_size_t) VM_DecOperandRegI64(name)
// Value of the f32 register aliasing the i32 storage at the ordinal.
#define VM_DecOperandRegF32(name)  \
  *((float*)&regs_i32[OP_I16(0)]); \
  pc += IREE_REGISTER_ORDINAL_SIZE;
// Value of the f64 register aliasing the i32 storage at the ordinal.
#define VM_DecOperandRegF64(name)   \
  *((double*)&regs_i32[OP_I16(0)]); \
  pc += IREE_REGISTER_ORDINAL_SIZE;
// Pointer to the ref register; the ordinal is masked with
// IREE_REF_REGISTER_MASK before indexing. |out_is_move| is always set to 0:
// decoding of the move bit is currently disabled (see the commented-out
// expression).
#define VM_DecOperandRegRef(name, out_is_move)                      \
  &regs_ref[OP_I16(0) & IREE_REF_REGISTER_MASK];                    \
  *(out_is_move) = 0; /*= OP_I16(0) & IREE_REF_REGISTER_MOVE_BIT;*/ \
  pc += IREE_REGISTER_ORDINAL_SIZE;
| #define VM_DecVariadicOperands(name) \ |
| VM_DecVariadicOperandsImpl(bytecode_data, &pc) |
| static inline const iree_vm_register_list_t* VM_DecVariadicOperandsImpl( |
| const uint8_t* IREE_RESTRICT bytecode_data, iree_vm_source_offset_t* pc) { |
| VM_AlignPC(*pc, IREE_REGISTER_ORDINAL_SIZE); |
| const iree_vm_register_list_t* list = |
| (const iree_vm_register_list_t*)&bytecode_data[*pc]; |
| *pc = *pc + IREE_REGISTER_ORDINAL_SIZE + |
| list->size * IREE_REGISTER_ORDINAL_SIZE; |
| return list; |
| } |
// Result register decoders.
//
// Each macro reads a 16-bit register ordinal at the current `pc` via
// OP_I16(0), yields a POINTER to the destination register (typed for the
// result width), and advances `pc` by IREE_REGISTER_ORDINAL_SIZE. They expand
// to `<pointer expression>; <pc advance>;` and are intended for use as an
// initializer: `int32_t* result = VM_DecResultRegI32("result");`.
//
// Requires `regs_i32` / `regs_ref`, `pc`, and OP_I16 at the expansion site.
// The |name| argument is documentation only.

// Pointer to the i32 result register.
#define VM_DecResultRegI32(name) \
  &regs_i32[OP_I16(0)];          \
  pc += IREE_REGISTER_ORDINAL_SIZE;
// Pointer to the i64 result register aliasing the i32 storage.
#define VM_DecResultRegI64(name)    \
  ((int64_t*)&regs_i32[OP_I16(0)]); \
  pc += IREE_REGISTER_ORDINAL_SIZE;
// Pointer to the f32 result register aliasing the i32 storage.
#define VM_DecResultRegF32(name)  \
  ((float*)&regs_i32[OP_I16(0)]); \
  pc += IREE_REGISTER_ORDINAL_SIZE;
// Pointer to the f64 result register aliasing the i32 storage.
#define VM_DecResultRegF64(name)   \
  ((double*)&regs_i32[OP_I16(0)]); \
  pc += IREE_REGISTER_ORDINAL_SIZE;
// Pointer to the ref result register; the ordinal is masked with
// IREE_REF_REGISTER_MASK before indexing. |out_is_move| is always set to 0:
// decoding of the move bit is currently disabled (see the commented-out
// expression).
#define VM_DecResultRegRef(name, out_is_move)                       \
  &regs_ref[OP_I16(0) & IREE_REF_REGISTER_MASK];                    \
  *(out_is_move) = 0; /*= OP_I16(0) & IREE_REF_REGISTER_MOVE_BIT;*/ \
  pc += IREE_REGISTER_ORDINAL_SIZE;
// Variadic results share the encoding of variadic operands.
#define VM_DecVariadicResults(name) VM_DecVariadicOperands(name)

// Size of a block marker in the bytecode stream.
// NOTE(review): presumably measured in bytes - confirm against the encoder.
#define IREE_VM_BLOCK_MARKER_SIZE 1
| |
| //===----------------------------------------------------------------------===// |
| // Dispatch table structure |
| //===----------------------------------------------------------------------===// |
| // We support both computed goto (gcc/clang) and switch-based dispatch. Computed |
| // goto is preferred when available as it has the most efficient codegen. MSVC |
| // doesn't support it, though, and there may be other targets (like wasm) that |
| // can only handle the switch-based approach. |
| |
#if defined(IREE_DISPATCH_MODE_COMPUTED_GOTO)

// Computed-goto dispatch.
//
// A dispatch table maps each opcode byte 1:1 to a label inside the interpreter
// function; dispatching is an indirect `goto*` through that table. Background:
// https://eli.thegreenplace.net/2012/07/12/computed-goto-for-efficient-dispatch-tables
//
// Tables are declared with exactly 256 entries so that every possible opcode
// byte - including unused/reserved ones - has a target; reserved entries jump
// to the shared `_dispatch_unhandled` label.
//
// Computed goto is a compiler extension rather than standard C, so the
// switch-based implementation in the #else branch provides the same macro
// surface for toolchains that lack it.
//
// All macros here expect `bytecode_data` and `pc` to be in scope.

// Opens the core dispatch region by jumping to the first instruction. The
// trailing `while (1)` lets the caller attach a braced body holding the op
// labels.
#define BEGIN_DISPATCH_CORE()                     \
  goto* kDispatchTable_CORE[bytecode_data[pc++]]; \
  while (1)
// Nothing to close in computed-goto mode.
#define END_DISPATCH_CORE()

// Table entry generators passed to IREE_VM_OP_CORE_TABLE: defined opcodes map
// to their op label, reserved slots map to the unhandled label.
#define DECLARE_DISPATCH_CORE_OPC(ordinal, name) &&_dispatch_CORE_##name,
#define DECLARE_DISPATCH_CORE_RSV(ordinal) &&_dispatch_unhandled,
// Declares the function-local core dispatch table.
#define DEFINE_DISPATCH_TABLE_CORE()                                    \
  static const void* kDispatchTable_CORE[256] = {IREE_VM_OP_CORE_TABLE( \
      DECLARE_DISPATCH_CORE_OPC, DECLARE_DISPATCH_CORE_RSV)};

// Extension tables: reserved slots share the unhandled label; each extension
// table is only declared when the extension is compiled in.
#define DECLARE_DISPATCH_EXT_RSV(ordinal) &&_dispatch_unhandled,
#if IREE_VM_EXT_F32_ENABLE
#define DECLARE_DISPATCH_EXT_F32_OPC(ordinal, name) &&_dispatch_EXT_F32_##name,
#define DEFINE_DISPATCH_TABLE_EXT_F32()                                       \
  static const void* kDispatchTable_EXT_F32[256] = {IREE_VM_OP_EXT_F32_TABLE( \
      DECLARE_DISPATCH_EXT_F32_OPC, DECLARE_DISPATCH_EXT_RSV)};
#else
#define DEFINE_DISPATCH_TABLE_EXT_F32()
#endif  // IREE_VM_EXT_F32_ENABLE
#if IREE_VM_EXT_F64_ENABLE
#define DECLARE_DISPATCH_EXT_F64_OPC(ordinal, name) &&_dispatch_EXT_F64_##name,
#define DEFINE_DISPATCH_TABLE_EXT_F64()                                       \
  static const void* kDispatchTable_EXT_F64[256] = {IREE_VM_OP_EXT_F64_TABLE( \
      DECLARE_DISPATCH_EXT_F64_OPC, DECLARE_DISPATCH_EXT_RSV)};
#else
#define DEFINE_DISPATCH_TABLE_EXT_F64()
#endif  // IREE_VM_EXT_F64_ENABLE

// Declares every dispatch table that is enabled in this build.
#define DEFINE_DISPATCH_TABLES()   \
  DEFINE_DISPATCH_TABLE_CORE();    \
  DEFINE_DISPATCH_TABLE_EXT_F32(); \
  DEFINE_DISPATCH_TABLE_EXT_F64();

// Defines the shared target for reserved/unknown opcodes: asserts and returns
// an UNIMPLEMENTED status from the enclosing function.
#define DISPATCH_UNHANDLED_CORE()                                           \
  _dispatch_unhandled : {                                                   \
    IREE_ASSERT(0);                                                         \
    return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "unhandled opcode"); \
  }
// Defines the core-table target for an extension prefix opcode whose
// extension is not handled: asserts and returns an UNIMPLEMENTED status
// naming |ext|.
#define UNHANDLED_DISPATCH_PREFIX(op_name, ext)                  \
  _dispatch_CORE_##op_name : {                                   \
    IREE_ASSERT(0);                                              \
    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,           \
                            "unhandled dispatch extension " #ext); \
  }

// Defines one op handler: the label for |ext|/|op_name|, an optional trace of
// the instruction, the handler |body|, and then a jump to the next
// instruction through the CORE table (also used after extension ops).
#define DISPATCH_OP(ext, op_name, body)                               \
  _dispatch_##ext##_##op_name:;                                       \
  IREE_DISPATCH_TRACE_INSTRUCTION(IREE_VM_PC_OFFSET_##ext, #op_name); \
  body;                                                               \
  goto* kDispatchTable_CORE[bytecode_data[pc++]];

// Opens an extension region: the core-table label for the prefix opcode reads
// the following byte and dispatches through the |ext| table.
#define BEGIN_DISPATCH_PREFIX(op_name, ext)                                   \
  _dispatch_CORE_##op_name : goto* kDispatchTable_##ext[bytecode_data[pc++]]; \
  while (1)
// Closes an extension region by dispatching the next core instruction.
#define END_DISPATCH_PREFIX() goto* kDispatchTable_CORE[bytecode_data[pc++]];

#else

// Switch-based dispatch.
//
// Provides the same macro surface as the computed-goto implementation using
// only standard C: a `while (1)` loop around a `switch` on the opcode byte.
// All macros here expect `bytecode_data` and `pc` to be in scope.

// Opens the dispatch loop and the opcode switch; the caller supplies the
// braced switch body and END_DISPATCH_CORE closes the loop.
#define BEGIN_DISPATCH_CORE() \
  while (1) {                 \
    switch (bytecode_data[pc++])
#define END_DISPATCH_CORE() }

// No tables are needed for switch dispatch.
#define DEFINE_DISPATCH_TABLES()

// `default:` case for opcodes with no handler. The original author marks this
// path unreachable on the grounds that bytecode has been verified ("ok because
// verified"); the status return remains as the fallback when the unreachable
// hint has no effect.
#define DISPATCH_UNHANDLED_CORE()                         \
  default: {                                              \
    IREE_ASSERT(0);                                       \
    IREE_BUILTIN_UNREACHABLE(); /* ok because verified */ \
    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,    \
                            "unhandled core opcode");     \
  }
// Case for an extension prefix opcode whose extension is not handled; same
// unreachable-then-status structure as DISPATCH_UNHANDLED_CORE.
#define UNHANDLED_DISPATCH_PREFIX(op_name, ext)                    \
  case IREE_VM_OP_CORE_##op_name: {                                \
    IREE_ASSERT(0);                                                \
    IREE_BUILTIN_UNREACHABLE(); /* ok because verified */          \
    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,             \
                            "unhandled dispatch extension " #ext); \
  }

// Defines one op handler as a `case`: optional trace, the handler |body|,
// then `break` out of the switch so the enclosing loop fetches the next
// opcode.
#define DISPATCH_OP(ext, op_name, body)                                 \
  case IREE_VM_OP_##ext##_##op_name: {                                  \
    IREE_DISPATCH_TRACE_INSTRUCTION(IREE_VM_PC_OFFSET_##ext, #op_name); \
    body;                                                               \
  } break;

// Opens an extension region: a core `case` for the prefix opcode containing a
// nested switch on the following byte. END_DISPATCH_PREFIX breaks out of the
// core switch and closes the case block.
#define BEGIN_DISPATCH_PREFIX(op_name, ext) \
  case IREE_VM_OP_CORE_##op_name: {         \
    switch (bytecode_data[pc++])
#define END_DISPATCH_PREFIX() \
  break;                      \
  }

#endif  // IREE_DISPATCH_MODE_COMPUTED_GOTO
| |
// Common dispatch op macros
//
// Each macro defines a complete DISPATCH_OP handler for an op whose operands
// and result all have the same register type: the operands are decoded in
// encoding order, then the result register, and finally |op_func| is applied
// to the operand values with its return value stored through the result
// pointer. |op_func| may be a function or a function-like macro.

// result(i32) = op_func(operand(i32))
#define DISPATCH_OP_CORE_UNARY_I32(op_name, op_func)  \
  DISPATCH_OP(CORE, op_name, {                        \
    int32_t operand = VM_DecOperandRegI32("operand"); \
    int32_t* result = VM_DecResultRegI32("result");   \
    *result = op_func(operand);                       \
  });

// result(i64) = op_func(operand(i64))
#define DISPATCH_OP_CORE_UNARY_I64(op_name, op_func)  \
  DISPATCH_OP(CORE, op_name, {                        \
    int64_t operand = VM_DecOperandRegI64("operand"); \
    int64_t* result = VM_DecResultRegI64("result");   \
    *result = op_func(operand);                       \
  });

// result(i32) = op_func(lhs(i32), rhs(i32))
#define DISPATCH_OP_CORE_BINARY_I32(op_name, op_func) \
  DISPATCH_OP(CORE, op_name, {                        \
    int32_t lhs = VM_DecOperandRegI32("lhs");         \
    int32_t rhs = VM_DecOperandRegI32("rhs");         \
    int32_t* result = VM_DecResultRegI32("result");   \
    *result = op_func(lhs, rhs);                      \
  });

// result(i64) = op_func(lhs(i64), rhs(i64))
#define DISPATCH_OP_CORE_BINARY_I64(op_name, op_func) \
  DISPATCH_OP(CORE, op_name, {                        \
    int64_t lhs = VM_DecOperandRegI64("lhs");         \
    int64_t rhs = VM_DecOperandRegI64("rhs");         \
    int64_t* result = VM_DecResultRegI64("result");   \
    *result = op_func(lhs, rhs);                      \
  });

// result(i32) = op_func(a(i32), b(i32), c(i32))
#define DISPATCH_OP_CORE_TERNARY_I32(op_name, op_func) \
  DISPATCH_OP(CORE, op_name, {                         \
    int32_t a = VM_DecOperandRegI32("a");              \
    int32_t b = VM_DecOperandRegI32("b");              \
    int32_t c = VM_DecOperandRegI32("c");              \
    int32_t* result = VM_DecResultRegI32("result");    \
    *result = op_func(a, b, c);                        \
  });

// result(i64) = op_func(a(i64), b(i64), c(i64))
#define DISPATCH_OP_CORE_TERNARY_I64(op_name, op_func) \
  DISPATCH_OP(CORE, op_name, {                         \
    int64_t a = VM_DecOperandRegI64("a");              \
    int64_t b = VM_DecOperandRegI64("b");              \
    int64_t c = VM_DecOperandRegI64("c");              \
    int64_t* result = VM_DecResultRegI64("result");    \
    *result = op_func(a, b, c);                        \
  });
| |
// f32 extension op helpers. Each defines a complete EXT_F32 DISPATCH_OP
// handler: operands are decoded in encoding order, then the result register,
// and |op_func| (function or function-like macro) is applied to the operand
// values with its return value stored through the result pointer.

// result(f32) = op_func(operand(f32))
#define DISPATCH_OP_EXT_F32_UNARY_F32(op_name, op_func) \
  DISPATCH_OP(EXT_F32, op_name, {                       \
    float operand = VM_DecOperandRegF32("operand");     \
    float* result = VM_DecResultRegF32("result");       \
    *result = op_func(operand);                         \
  });

// result(f32) = op_func(lhs(f32), rhs(f32))
#define DISPATCH_OP_EXT_F32_BINARY_F32(op_name, op_func) \
  DISPATCH_OP(EXT_F32, op_name, {                        \
    float lhs = VM_DecOperandRegF32("lhs");              \
    float rhs = VM_DecOperandRegF32("rhs");              \
    float* result = VM_DecResultRegF32("result");        \
    *result = op_func(lhs, rhs);                         \
  });

// result(f32) = op_func(a(f32), b(f32), c(f32))
#define DISPATCH_OP_EXT_F32_TERNARY_F32(op_name, op_func) \
  DISPATCH_OP(EXT_F32, op_name, {                         \
    float a = VM_DecOperandRegF32("a");                   \
    float b = VM_DecOperandRegF32("b");                   \
    float c = VM_DecOperandRegF32("c");                   \
    float* result = VM_DecResultRegF32("result");         \
    *result = op_func(a, b, c);                           \
  });
| |
// f64 extension op helpers. Each defines a complete EXT_F64 DISPATCH_OP
// handler: operands are decoded in encoding order, then the result register,
// and |op_func| (function or function-like macro) is applied to the operand
// values with its return value stored through the result pointer.

// result(f64) = op_func(operand(f64))
#define DISPATCH_OP_EXT_F64_UNARY_F64(op_name, op_func) \
  DISPATCH_OP(EXT_F64, op_name, {                       \
    double operand = VM_DecOperandRegF64("operand");    \
    double* result = VM_DecResultRegF64("result");      \
    *result = op_func(operand);                         \
  });

// result(f64) = op_func(lhs(f64), rhs(f64))
#define DISPATCH_OP_EXT_F64_BINARY_F64(op_name, op_func) \
  DISPATCH_OP(EXT_F64, op_name, {                        \
    double lhs = VM_DecOperandRegF64("lhs");             \
    double rhs = VM_DecOperandRegF64("rhs");             \
    double* result = VM_DecResultRegF64("result");       \
    *result = op_func(lhs, rhs);                         \
  });

// result(f64) = op_func(a(f64), b(f64), c(f64))
#define DISPATCH_OP_EXT_F64_TERNARY_F64(op_name, op_func) \
  DISPATCH_OP(EXT_F64, op_name, {                         \
    double a = VM_DecOperandRegF64("a");                  \
    double b = VM_DecOperandRegF64("b");                  \
    double c = VM_DecOperandRegF64("c");                  \
    double* result = VM_DecResultRegF64("result");        \
    *result = op_func(a, b, c);                           \
  });
| |
| #endif // IREE_VM_BYTECODE_DISPATCH_UTIL_H_ |