Update Shodan ukernel to match latest IREE ukernel changes The main change is to merge elementwise_impl.c.inc into elementwise.c Change-Id: I6b8c34de0ea15e83ad9054af3bd16c76f01779f5
diff --git a/vmvx_ukernel/CMakeLists.txt b/vmvx_ukernel/CMakeLists.txt index 4272c4b..cf4edb3 100644 --- a/vmvx_ukernel/CMakeLists.txt +++ b/vmvx_ukernel/CMakeLists.txt
@@ -7,7 +7,6 @@ "${IREE_RUNTIME_SOURCE_DIR}/builtins/ukernel/api.h" SRCS "elementwise.c" - "elementwise_impl.c.inc" "mmt4d_tile.c" "query_tile_sizes.c" "${IREE_RUNTIME_SOURCE_DIR}/builtins/ukernel/mmt4d.c"
diff --git a/vmvx_ukernel/elementwise.c b/vmvx_ukernel/elementwise.c index d0e4683..5a06913 100644 --- a/vmvx_ukernel/elementwise.c +++ b/vmvx_ukernel/elementwise.c
@@ -16,8 +16,365 @@ #include "iree/builtins/ukernel/elementwise.h" -// Include the implementation helpers. -#include "vmvx_ukernel/elementwise_impl.c.inc" +#include <math.h> +#include <riscv_vector.h> + +//===----------------------------------------------------------------------===// +// Helpers for defining generic implementations of elementwise functions. +// Since it affords the best code size tradeoff options, the entrypoint +// is dispatched based on an opcode. +//===----------------------------------------------------------------------===// + +// Opcodes for generic functions operating on 32-bit operands and result. +// Since the outer dispatcher only differentiates based on width, all other +// type specificity is carried by the opcode. +// Binary opcodes are named "X32B" and unary opcodes "X32U". +// The initial list was sorted, and it is encouraged to sort extensions, but +// each opcode must be numerically stable, so the list is not expected to +// be sorted over time. +typedef enum { + IREE_UK_X32B_ADDF = 0, + IREE_UK_X32B_ADDI = 1, + IREE_UK_X32B_ANDI = 2, + IREE_UK_X32B_DIVF = 3, + IREE_UK_X32B_DIVSI = 4, + IREE_UK_X32B_DIVUI = 5, + IREE_UK_X32B_MULF = 6, + IREE_UK_X32B_MULI = 7, + IREE_UK_X32B_ORI = 8, + IREE_UK_X32B_SHLI = 9, + IREE_UK_X32B_SHRSI = 10, + IREE_UK_X32B_SHRUI = 11, + IREE_UK_X32B_SUBF = 12, + IREE_UK_X32B_SUBI = 13, + IREE_UKENREL_X32B_XORI = 14, +} iree_uk_x32b_opcode_t; + +typedef enum { + IREE_UK_X32B_UI = 0, // unsigned integer + IREE_UK_X32B_SI = 1, // signed integer + IREE_UK_X32B_NA = 2, // not available in RVV +} iree_uk_x32b_opcode_type_t; + +typedef enum { + IREE_UK_X32U_ABSF, + IREE_UK_X32U_CEILF, + IREE_UK_X32U_CTLZ, + IREE_UK_X32U_EXPF, + IREE_UK_X32U_FLOORF, + IREE_UK_X32U_LOGF, + IREE_UK_X32U_NEGF, + IREE_UK_X32U_RSQRTF, +} iree_uk_x32u_opcode_t; + +// Macros to access various typed, dereferenced pointers. 
+#define ASF32(ptr) *((float*)ptr) +#define ASUI32(ptr) *((iree_uk_uint32_t*)ptr) +#define ASSI32(ptr) *((iree_uk_int32_t*)ptr) + +//===----------------------------------------------------------------------===// +// Implementation macros. +//===----------------------------------------------------------------------===// + +// Defines a generic "dispatched" implementation via opcode_t by invoking +// the function iree_uk_generic_{category}_2d. +// Corresponds to the header macro DECLARE_UKERNEL_BINARY_2D. +#define DISPATCH_UKERNEL_BINARY_2D(opcode, opcode_t, dtype, category) \ + IREE_UK_EXPORT int iree_uk_##category##_##opcode##_2d( \ + const dtype* lhs, iree_uk_ssize_t lhs_offset, \ + iree_uk_ssize_t lhs_stride0, iree_uk_ssize_t lhs_stride1, \ + const dtype* rhs, iree_uk_ssize_t rhs_offset, \ + iree_uk_ssize_t rhs_stride0, iree_uk_ssize_t rhs_stride1, \ + dtype* IREE_UK_RESTRICT out, iree_uk_ssize_t out_offset, \ + iree_uk_ssize_t out_stride0, iree_uk_ssize_t out_stride1, \ + iree_uk_ssize_t size0, iree_uk_ssize_t size1) { \ + return iree_uk_##category##_2d(opcode_t, lhs, lhs_offset, lhs_stride0, \ + lhs_stride1, rhs, rhs_offset, rhs_stride0, \ + rhs_stride1, out, out_offset, out_stride0, \ + out_stride1, size0, size1); \ + } + +// Defines a generic "dispatched" implementation via opcode_t by invoking +// the function iree_uk_generic_{category}_2d. +// Corresponds to the header macro DECLARE_UKERNEL_BINARY_2D. 
+#define DISPATCH_UKERNEL_UNARY_2D(opcode, opcode_t, dtype, category) \ + IREE_UK_EXPORT int iree_uk_##category##_##opcode##_2d( \ + const dtype* in, iree_uk_ssize_t in_offset, iree_uk_ssize_t in_stride0, \ + iree_uk_ssize_t in_stride1, dtype* IREE_UK_RESTRICT out, \ + iree_uk_ssize_t out_offset, iree_uk_ssize_t out_stride0, \ + iree_uk_ssize_t out_stride1, iree_uk_ssize_t size0, \ + iree_uk_ssize_t size1) { \ + return iree_uk_generic_##category##_2d( \ + opcode_t, in, in_offset, in_stride0, in_stride1, out, out_offset, \ + out_stride0, out_stride1, size0, size1); \ + } + +//===----------------------------------------------------------------------===// +// Internal helpers. +//===----------------------------------------------------------------------===// + +static iree_uk_x32b_opcode_type_t get_iree_uk_x32b_op_type( + iree_uk_x32b_opcode_t opcode) { + switch (opcode) { + case IREE_UK_X32B_ADDI: + case IREE_UK_X32B_ANDI: + case IREE_UK_X32B_DIVUI: + case IREE_UK_X32B_MULI: + case IREE_UK_X32B_ORI: + case IREE_UK_X32B_SHLI: + case IREE_UK_X32B_SHRUI: + case IREE_UKENREL_X32B_XORI: + case IREE_UK_X32B_SUBI: + return IREE_UK_X32B_UI; + case IREE_UK_X32B_DIVSI: + return IREE_UK_X32B_SI; + default: + return IREE_UK_X32B_NA; + } +} + +// Computes a single element of an x32b opcode using RVV. 
+static void iree_uk_rvv_x32b_op(iree_uk_x32b_opcode_t opcode, int* result_code, + const iree_uk_uint32_t* lhs, + iree_uk_ssize_t lhs_stride, + const iree_uk_uint32_t* rhs, + iree_uk_ssize_t rhs_stride, + iree_uk_uint32_t* out, + iree_uk_ssize_t out_stride, size_t vl) { + iree_uk_x32b_opcode_type_t op_type = get_iree_uk_x32b_op_type(opcode); + if (op_type == IREE_UK_X32B_UI) { + vuint32m8_t vx = vlse32_v_u32m8(lhs, lhs_stride, vl); // load + vuint32m8_t vy = vlse32_v_u32m8(rhs, rhs_stride, vl); // load + switch (opcode) { + case IREE_UK_X32B_ADDI: + vx = vadd(vx, vy, vl); + break; + case IREE_UK_X32B_ANDI: + vx = vand(vx, vy, vl); + break; + case IREE_UK_X32B_DIVUI: + vx = vdivu(vx, vy, vl); + break; + case IREE_UK_X32B_MULI: + vx = vmul(vx, vy, vl); + break; + case IREE_UK_X32B_ORI: + vx = vor(vx, vy, vl); + break; + case IREE_UK_X32B_SHLI: + vx = vsll(vx, vy, vl); + break; + case IREE_UK_X32B_SHRUI: + vx = vsrl(vx, vy, vl); + break; + case IREE_UKENREL_X32B_XORI: + vx = vxor(vx, vy, vl); + break; + case IREE_UK_X32B_SUBI: + vx = vsub(vx, vy, vl); + break; + default: + *result_code = 1; + } + vsse32(out, out_stride, vx, vl); // save + } else if (op_type == IREE_UK_X32B_SI) { + vint32m8_t vx = + vlse32_v_i32m8((iree_uk_int32_t*)lhs, lhs_stride, vl); // load + vint32m8_t vy = + vlse32_v_i32m8((iree_uk_int32_t*)rhs, rhs_stride, vl); // load + switch (opcode) { + case IREE_UK_X32B_DIVSI: + vx = vdiv(vx, vy, vl); + break; + default: + *result_code = 1; + } + vsse32((iree_uk_int32_t*)out, out_stride, vx, vl); // save + } else { + *result_code = 1; + } +} + +// Computes a single element of an x32b opcode. On error, should set +// |*result_code| to a non-zero value (but should not touch it otherwise). 
+static void iree_uk_generic_x32b_op(iree_uk_x32b_opcode_t opcode, + int* result_code, + const iree_uk_uint32_t* lhs, + const iree_uk_uint32_t* rhs, + iree_uk_uint32_t* out) { + switch (opcode) { + case IREE_UK_X32B_ADDF: + ASF32(out) = ASF32(lhs) + ASF32(rhs); + return; + case IREE_UK_X32B_ADDI: + ASUI32(out) = ASUI32(lhs) + ASUI32(rhs); + return; + case IREE_UK_X32B_ANDI: + ASUI32(out) = ASUI32(lhs) & ASUI32(rhs); + return; + case IREE_UK_X32B_DIVF: + ASF32(out) = ASF32(lhs) / ASF32(rhs); + return; + case IREE_UK_X32B_DIVSI: + ASSI32(out) = ASSI32(lhs) / ASSI32(rhs); + return; + case IREE_UK_X32B_DIVUI: + ASUI32(out) = ASUI32(lhs) / ASUI32(rhs); + return; + case IREE_UK_X32B_MULF: + ASF32(out) = ASF32(lhs) * ASF32(rhs); + return; + case IREE_UK_X32B_MULI: + ASUI32(out) = ASUI32(lhs) * ASUI32(rhs); + return; + case IREE_UK_X32B_ORI: + ASUI32(out) = ASUI32(lhs) | ASUI32(rhs); + return; + case IREE_UK_X32B_SHLI: + ASUI32(out) = ASUI32(lhs) << ASUI32(rhs); + return; + case IREE_UK_X32B_SHRSI: + ASSI32(out) = ASSI32(lhs) >> ASSI32(rhs); + return; + case IREE_UK_X32B_SHRUI: + ASUI32(out) = ASUI32(lhs) >> ASUI32(rhs); + return; + case IREE_UKENREL_X32B_XORI: + ASUI32(out) = ASUI32(lhs) ^ ASUI32(rhs); + return; + case IREE_UK_X32B_SUBF: + ASF32(out) = ASF32(lhs) - ASF32(rhs); + return; + case IREE_UK_X32B_SUBI: + ASSI32(out) = ASUI32(lhs) - ASUI32(rhs); + return; + default: + *result_code = 1; + } +} + +// Computes a single element of an x32u opcode. Most are float ops. On error, +// should set |*result_code| to a non-zero value (but should not touch it +// otherwise). 
+static void iree_uk_generic_x32u_op(iree_uk_x32u_opcode_t opcode, + int* result_code, + const iree_uk_uint32_t* in, + iree_uk_uint32_t* out) { + switch (opcode) { + case IREE_UK_X32U_ABSF: + ASF32(out) = fabsf(ASF32(in)); + return; + case IREE_UK_X32U_CEILF: + ASF32(out) = ceilf(ASF32(in)); + return; + case IREE_UK_X32U_CTLZ: + ASUI32(out) = iree_uk_count_leading_zeros_u32(ASUI32(in)); + return; + case IREE_UK_X32U_EXPF: + ASF32(out) = expf(ASF32(in)); + return; + case IREE_UK_X32U_FLOORF: + ASF32(out) = floorf(ASF32(in)); + return; + case IREE_UK_X32U_LOGF: + ASF32(out) = logf(ASF32(in)); + return; + case IREE_UK_X32U_NEGF: + ASF32(out) = -ASF32(in); + return; + case IREE_UK_X32U_RSQRTF: + ASF32(out) = 1.0f / sqrtf(ASF32(in)); + return; + default: + *result_code = 1; + } +} + +//===----------------------------------------------------------------------===// +// Opcode dispatch entry points. +//===----------------------------------------------------------------------===// + +// 32bit binary kernels. +IREE_UK_ATTRIBUTE_NOINLINE static int iree_uk_x32b_2d( + iree_uk_x32b_opcode_t opcode, + // LHS. + const iree_uk_uint32_t* lhs, iree_uk_ssize_t lhs_offset, + iree_uk_ssize_t lhs_stride0, iree_uk_ssize_t lhs_stride1, + // RHS + const iree_uk_uint32_t* rhs, iree_uk_ssize_t rhs_offset, + iree_uk_ssize_t rhs_stride0, iree_uk_ssize_t rhs_stride1, + // OUT. + iree_uk_uint32_t* IREE_UK_RESTRICT out, iree_uk_ssize_t out_offset, + iree_uk_ssize_t out_stride0, iree_uk_ssize_t out_stride1, + // Sizes. 
+ iree_uk_ssize_t size0, iree_uk_ssize_t size1) { + int result_code = 0; + + if (get_iree_uk_x32b_op_type(opcode) != IREE_UK_X32B_NA) { + size_t vl; + // make most use of vectorization by switching dimension + if (size0 < size1) { + for (iree_uk_ssize_t i = 0; i < size0; ++i) { + for (iree_uk_ssize_t j = 0; j < size1; j += vl) { + vl = vsetvl_e32m8(size1 - j); + iree_uk_rvv_x32b_op(opcode, &result_code, + &lhs[i * lhs_stride0 + j * lhs_stride1], + lhs_stride1 * sizeof(uint32_t), + &rhs[i * rhs_stride0 + j * rhs_stride1], + rhs_stride1 * sizeof(uint32_t), + &out[i * out_stride0 + j * out_stride1], + out_stride1 * sizeof(uint32_t), vl); + } + } + } else { + for (iree_uk_ssize_t j = 0; j < size1; ++j) { + for (iree_uk_ssize_t i = 0; i < size0; i += vl) { + vl = vsetvl_e32m8(size0 - i); + iree_uk_rvv_x32b_op(opcode, &result_code, + &lhs[i * lhs_stride0 + j * lhs_stride1], + lhs_stride0 * sizeof(uint32_t), + &rhs[i * rhs_stride0 + j * rhs_stride1], + rhs_stride0 * sizeof(uint32_t), + &out[i * out_stride0 + j * out_stride1], + out_stride0 * sizeof(uint32_t), vl); + } + } + } + } else { + for (iree_uk_ssize_t i = 0; i < size0; ++i) { + for (iree_uk_ssize_t j = 0; j < size1; ++j) { + iree_uk_generic_x32b_op(opcode, &result_code, + &lhs[i * lhs_stride0 + j * lhs_stride1], + &rhs[i * rhs_stride0 + j * rhs_stride1], + &out[i * out_stride0 + j * out_stride1]); + } + } + } + return result_code; +} + +// Generic 32bit unary kernels. +IREE_UK_ATTRIBUTE_NOINLINE static int iree_uk_generic_x32u_2d( + iree_uk_x32u_opcode_t opcode, + // IN. + const iree_uk_uint32_t* in, iree_uk_ssize_t in_offset, + iree_uk_ssize_t in_stride0, iree_uk_ssize_t in_stride1, + // OUT. + iree_uk_uint32_t* IREE_UK_RESTRICT out, iree_uk_ssize_t out_offset, + iree_uk_ssize_t out_stride0, iree_uk_ssize_t out_stride1, + // Sizes. + iree_uk_ssize_t size0, iree_uk_ssize_t size1) { + int result_code = 0; + // TODO: Manually unroll to x4 to trigger vectorization. 
+ for (iree_uk_ssize_t i = 0; i < size0; ++i) { + for (iree_uk_ssize_t j = 0; j < size1; ++j) { + iree_uk_generic_x32u_op(opcode, &result_code, + &in[i * in_stride0 + j * in_stride1], + &out[i * out_stride0 + j * out_stride1]); + } + } + return result_code; +} DISPATCH_UKERNEL_BINARY_2D(addf, IREE_UK_X32B_ADDF, iree_uk_uint32_t, x32b); DISPATCH_UKERNEL_BINARY_2D(addi, IREE_UK_X32B_ADDI, iree_uk_uint32_t, x32b);
diff --git a/vmvx_ukernel/elementwise_impl.c.inc b/vmvx_ukernel/elementwise_impl.c.inc deleted file mode 100644 index 4de26b0..0000000 --- a/vmvx_ukernel/elementwise_impl.c.inc +++ /dev/null
@@ -1,377 +0,0 @@ -/* - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "iree/builtins/ukernel/common.h" - -#include <math.h> -#include <riscv_vector.h> - -//===----------------------------------------------------------------------===// -// Helpers for defining generic implementations of elementwise functions. -// Since it affords the best code size tradeoff options, the entrypoint -// is dispatched based on an opcode. -//===----------------------------------------------------------------------===// - -// Opcodes for generic functions operating on 32-bit operands and result. -// Since the outer dispatcher only differentiates based on width, all other -// type specificity is carried by the opcode. -// Binary opcodes are named "X32B" and unary opcodes "X32U". -// The initial list was sorted, and it is encouraged to sort extensions, but -// each opcode must be numerically stable, so the list is not expected to -// be sorted over time. 
-typedef enum { - IREE_UK_X32B_ADDF = 0, - IREE_UK_X32B_ADDI = 1, - IREE_UK_X32B_ANDI = 2, - IREE_UK_X32B_DIVF = 3, - IREE_UK_X32B_DIVSI = 4, - IREE_UK_X32B_DIVUI = 5, - IREE_UK_X32B_MULF = 6, - IREE_UK_X32B_MULI = 7, - IREE_UK_X32B_ORI = 8, - IREE_UK_X32B_SHLI = 9, - IREE_UK_X32B_SHRSI = 10, - IREE_UK_X32B_SHRUI = 11, - IREE_UK_X32B_SUBF = 12, - IREE_UK_X32B_SUBI = 13, - IREE_UKENREL_X32B_XORI = 14, -} iree_uk_x32b_opcode_t; - -typedef enum { - IREE_UK_X32B_UI = 0, // unsigned integer - IREE_UK_X32B_SI = 1, // signed integer - IREE_UK_X32B_NA = 2, // not available in RVV -} iree_uk_x32b_opcode_type_t; - -typedef enum { - IREE_UK_X32U_ABSF, - IREE_UK_X32U_CEILF, - IREE_UK_X32U_CTLZ, - IREE_UK_X32U_EXPF, - IREE_UK_X32U_FLOORF, - IREE_UK_X32U_LOGF, - IREE_UK_X32U_NEGF, - IREE_UK_X32U_RSQRTF, -} iree_uk_x32u_opcode_t; - -// Macros to access various typed, dereferenced pointers. -#define ASF32(ptr) *((float*)ptr) -#define ASUI32(ptr) *((iree_uk_uint32_t*)ptr) -#define ASSI32(ptr) *((iree_uk_int32_t*)ptr) - -//===----------------------------------------------------------------------===// -// Implementation macros. -//===----------------------------------------------------------------------===// - -// Defines a generic "dispatched" implementation via opcode_t by invoking -// the function iree_uk_generic_{category}_2d. -// Corresponds to the header macro DECLARE_UKERNEL_BINARY_2D. 
-#define DISPATCH_UKERNEL_BINARY_2D(opcode, opcode_t, dtype, category) \ - IREE_UK_EXPORT int iree_uk_##category##_##opcode##_2d( \ - const dtype* lhs, iree_uk_ssize_t lhs_offset, \ - iree_uk_ssize_t lhs_stride0, iree_uk_ssize_t lhs_stride1, \ - const dtype* rhs, iree_uk_ssize_t rhs_offset, \ - iree_uk_ssize_t rhs_stride0, iree_uk_ssize_t rhs_stride1, \ - dtype* IREE_UK_RESTRICT out, iree_uk_ssize_t out_offset, \ - iree_uk_ssize_t out_stride0, iree_uk_ssize_t out_stride1, \ - iree_uk_ssize_t size0, iree_uk_ssize_t size1) { \ - return iree_uk_##category##_2d(opcode_t, lhs, lhs_offset, lhs_stride0, \ - lhs_stride1, rhs, rhs_offset, rhs_stride0, \ - rhs_stride1, out, out_offset, out_stride0, \ - out_stride1, size0, size1); \ - } - -// Defines a generic "dispatched" implementation via opcode_t by invoking -// the function iree_uk_generic_{category}_2d. -// Corresponds to the header macro DECLARE_UKERNEL_BINARY_2D. -#define DISPATCH_UKERNEL_UNARY_2D(opcode, opcode_t, dtype, category) \ - IREE_UK_EXPORT int iree_uk_##category##_##opcode##_2d( \ - const dtype* in, iree_uk_ssize_t in_offset, iree_uk_ssize_t in_stride0, \ - iree_uk_ssize_t in_stride1, dtype* IREE_UK_RESTRICT out, \ - iree_uk_ssize_t out_offset, iree_uk_ssize_t out_stride0, \ - iree_uk_ssize_t out_stride1, iree_uk_ssize_t size0, \ - iree_uk_ssize_t size1) { \ - return iree_uk_generic_##category##_2d( \ - opcode_t, in, in_offset, in_stride0, in_stride1, out, out_offset, \ - out_stride0, out_stride1, size0, size1); \ - } - -//===----------------------------------------------------------------------===// -// Internal helpers. 
-//===----------------------------------------------------------------------===// - -static iree_uk_x32b_opcode_type_t get_iree_uk_x32b_op_type( - iree_uk_x32b_opcode_t opcode) { - switch (opcode) { - case IREE_UK_X32B_ADDI: - case IREE_UK_X32B_ANDI: - case IREE_UK_X32B_DIVUI: - case IREE_UK_X32B_MULI: - case IREE_UK_X32B_ORI: - case IREE_UK_X32B_SHLI: - case IREE_UK_X32B_SHRUI: - case IREE_UKENREL_X32B_XORI: - case IREE_UK_X32B_SUBI: - return IREE_UK_X32B_UI; - case IREE_UK_X32B_DIVSI: - return IREE_UK_X32B_SI; - default: - return IREE_UK_X32B_NA; - } -} - -// Computes a single element of an x32b opcode usinbg RVV. -static void iree_uk_rvv_x32b_op(iree_uk_x32b_opcode_t opcode, int* result_code, - const iree_uk_uint32_t* lhs, - iree_uk_ssize_t lhs_stride, - const iree_uk_uint32_t* rhs, - iree_uk_ssize_t rhs_stride, - iree_uk_uint32_t* out, - iree_uk_ssize_t out_stride, size_t vl) { - iree_uk_x32b_opcode_type_t op_type = get_iree_uk_x32b_op_type(opcode); - if (op_type == IREE_UK_X32B_UI) { - vuint32m8_t vx = vlse32_v_u32m8(lhs, lhs_stride, vl); // load - vuint32m8_t vy = vlse32_v_u32m8(rhs, rhs_stride, vl); // load - switch (opcode) { - case IREE_UK_X32B_ADDI: - vx = vadd(vx, vy, vl); - break; - case IREE_UK_X32B_ANDI: - vx = vand(vx, vy, vl); - break; - case IREE_UK_X32B_DIVUI: - vx = vdivu(vx, vy, vl); - break; - case IREE_UK_X32B_MULI: - vx = vmul(vx, vy, vl); - break; - case IREE_UK_X32B_ORI: - vx = vor(vx, vy, vl); - break; - case IREE_UK_X32B_SHLI: - vx = vsll(vx, vy, vl); - break; - case IREE_UK_X32B_SHRUI: - vx = vsrl(vx, vy, vl); - break; - case IREE_UKENREL_X32B_XORI: - vx = vor(vx, vy, vl); - break; - case IREE_UK_X32B_SUBI: - vx = vsub(vx, vy, vl); - break; - default: - *result_code = 1; - } - vsse32(out, out_stride, vx, vl); // save - } else if (op_type == IREE_UK_X32B_SI) { - vint32m8_t vx = - vlse32_v_i32m8((iree_uk_int32_t*)lhs, lhs_stride, vl); // load - vint32m8_t vy = - vlse32_v_i32m8((iree_uk_int32_t*)rhs, rhs_stride, vl); // load - switch 
(opcode) { - case IREE_UK_X32B_DIVSI: - vx = vdiv(vx, vy, vl); - break; - default: - *result_code = 1; - } - vsse32((iree_uk_int32_t*)out, out_stride, vx, vl); // save - } else { - *result_code = 1; - } -} - -// Computes a single element of an x32b opcode. On error, should set -// |*result_code| to a non-zero value (but should not touch it otherwise). -static void iree_uk_generic_x32b_op(iree_uk_x32b_opcode_t opcode, - int* result_code, - const iree_uk_uint32_t* lhs, - const iree_uk_uint32_t* rhs, - iree_uk_uint32_t* out) { - switch (opcode) { - case IREE_UK_X32B_ADDF: - ASF32(out) = ASF32(lhs) + ASF32(rhs); - return; - case IREE_UK_X32B_ADDI: - ASUI32(out) = ASUI32(lhs) + ASUI32(rhs); - return; - case IREE_UK_X32B_ANDI: - ASUI32(out) = ASUI32(lhs) & ASUI32(rhs); - return; - case IREE_UK_X32B_DIVF: - ASF32(out) = ASF32(lhs) / ASF32(rhs); - return; - case IREE_UK_X32B_DIVSI: - ASSI32(out) = ASSI32(lhs) / ASSI32(rhs); - return; - case IREE_UK_X32B_DIVUI: - ASUI32(out) = ASUI32(lhs) / ASUI32(rhs); - return; - case IREE_UK_X32B_MULF: - ASF32(out) = ASF32(lhs) * ASF32(rhs); - return; - case IREE_UK_X32B_MULI: - ASUI32(out) = ASUI32(lhs) * ASUI32(rhs); - return; - case IREE_UK_X32B_ORI: - ASUI32(out) = ASUI32(lhs) | ASUI32(rhs); - return; - case IREE_UK_X32B_SHLI: - ASUI32(out) = ASUI32(lhs) << ASUI32(rhs); - return; - case IREE_UK_X32B_SHRSI: - ASSI32(out) = ASSI32(lhs) >> ASSI32(rhs); - return; - case IREE_UK_X32B_SHRUI: - ASUI32(out) = ASUI32(lhs) >> ASUI32(rhs); - return; - case IREE_UKENREL_X32B_XORI: - ASUI32(out) = ASUI32(lhs) ^ ASUI32(rhs); - return; - case IREE_UK_X32B_SUBF: - ASF32(out) = ASF32(lhs) - ASF32(rhs); - return; - case IREE_UK_X32B_SUBI: - ASSI32(out) = ASUI32(lhs) - ASUI32(rhs); - return; - default: - *result_code = 1; - } -} - -// Computes a single element of an x32u opcode. Most are float ops. On error, -// should set |*result_code| to a non-zero value (but should not touch it -// otherwise). 
-static void iree_uk_generic_x32u_op(iree_uk_x32u_opcode_t opcode, - int* result_code, - const iree_uk_uint32_t* in, - iree_uk_uint32_t* out) { - switch (opcode) { - case IREE_UK_X32U_ABSF: - ASF32(out) = fabsf(ASF32(in)); - return; - case IREE_UK_X32U_CEILF: - ASF32(out) = ceilf(ASF32(in)); - return; - case IREE_UK_X32U_CTLZ: - ASUI32(out) = iree_uk_count_leading_zeros_u32(ASUI32(in)); - return; - case IREE_UK_X32U_EXPF: - ASF32(out) = expf(ASF32(in)); - return; - case IREE_UK_X32U_FLOORF: - ASF32(out) = floorf(ASF32(in)); - return; - case IREE_UK_X32U_LOGF: - ASF32(out) = logf(ASF32(in)); - return; - case IREE_UK_X32U_NEGF: - ASF32(out) = -ASF32(in); - return; - case IREE_UK_X32U_RSQRTF: - ASF32(out) = 1.0f / sqrtf(ASF32(in)); - return; - default: - *result_code = 1; - } -} - -//===----------------------------------------------------------------------===// -// Opcode dispatch entry points. -//===----------------------------------------------------------------------===// - -// 32bit binary kernels. -IREE_UK_ATTRIBUTE_NOINLINE static int iree_uk_x32b_2d( - iree_uk_x32b_opcode_t opcode, - // LHS. - const iree_uk_uint32_t* lhs, iree_uk_ssize_t lhs_offset, - iree_uk_ssize_t lhs_stride0, iree_uk_ssize_t lhs_stride1, - // RHS - const iree_uk_uint32_t* rhs, iree_uk_ssize_t rhs_offset, - iree_uk_ssize_t rhs_stride0, iree_uk_ssize_t rhs_stride1, - // OUT. - iree_uk_uint32_t* IREE_UK_RESTRICT out, iree_uk_ssize_t out_offset, - iree_uk_ssize_t out_stride0, iree_uk_ssize_t out_stride1, - // Sizes. 
- iree_uk_ssize_t size0, iree_uk_ssize_t size1) { - int result_code = 0; - - if (get_iree_uk_x32b_op_type(opcode) != IREE_UK_X32B_NA) { - size_t vl; - // make most use of vectorization by swiching dimension - if (size0 < size1) { - for (iree_uk_ssize_t i = 0; i < size0; ++i) { - for (iree_uk_ssize_t j = 0; j < size1; j += vl) { - vl = vsetvl_e32m8(size1 - j); - iree_uk_rvv_x32b_op(opcode, &result_code, - &lhs[i * lhs_stride0 + j * lhs_stride1], - lhs_stride1 * sizeof(uint32_t), - &rhs[i * rhs_stride0 + j * rhs_stride1], - rhs_stride1 * sizeof(uint32_t), - &out[i * out_stride0 + j * out_stride1], - out_stride1 * sizeof(uint32_t), vl); - } - } - } else { - for (iree_uk_ssize_t j = 0; j < size1; ++j) { - for (iree_uk_ssize_t i = 0; i < size0; i += vl) { - vl = vsetvl_e32m8(size0 - i); - iree_uk_rvv_x32b_op(opcode, &result_code, - &lhs[i * lhs_stride0 + j * lhs_stride1], - lhs_stride0 * sizeof(uint32_t), - &rhs[i * rhs_stride0 + j * rhs_stride1], - rhs_stride0 * sizeof(uint32_t), - &out[i * out_stride0 + j * out_stride1], - out_stride0 * sizeof(uint32_t), vl); - } - } - } - } else { - for (iree_uk_ssize_t i = 0; i < size0; ++i) { - for (iree_uk_ssize_t j = 0; j < size1; ++j) { - iree_uk_generic_x32b_op(opcode, &result_code, - &lhs[i * lhs_stride0 + j * lhs_stride1], - &rhs[i * rhs_stride0 + j * rhs_stride1], - &out[i * out_stride0 + j * out_stride1]); - } - } - } - return result_code; -} - -// Generic 32bit unary kernels. -IREE_UK_ATTRIBUTE_NOINLINE static int iree_uk_generic_x32u_2d( - iree_uk_x32u_opcode_t opcode, - // IN. - const iree_uk_uint32_t* in, iree_uk_ssize_t in_offset, - iree_uk_ssize_t in_stride0, iree_uk_ssize_t in_stride1, - // OUT. - iree_uk_uint32_t* IREE_UK_RESTRICT out, iree_uk_ssize_t out_offset, - iree_uk_ssize_t out_stride0, iree_uk_ssize_t out_stride1, - // Sizes. - iree_uk_ssize_t size0, iree_uk_ssize_t size1) { - int result_code = 0; - // TODO: Manually unroll to x4 to trigger vectorization. 
- for (iree_uk_ssize_t i = 0; i < size0; ++i) { - for (iree_uk_ssize_t j = 0; j < size1; ++j) { - iree_uk_generic_x32u_op(opcode, &result_code, - &in[i * in_stride0 + j * in_stride1], - &out[i * out_stride0 + j * out_stride1]); - } - } - return result_code; -}