Update Shodan ukernel to match latest IREE ukernel changes
The main change is to merge elementwise_impl.c.inc into elementwise.c.
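
For reference, each DISPATCH_UKERNEL_BINARY_2D invocation at the bottom of
elementwise.c now expands in place to a thin exported wrapper; a sketch of
the addf case, with the full offset/stride parameter list elided:

  IREE_UK_EXPORT int iree_uk_x32b_addf_2d(const iree_uk_uint32_t* lhs, ...) {
    return iree_uk_x32b_2d(IREE_UK_X32B_ADDF, lhs, ...);
  }
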
Change-Id: I6b8c34de0ea15e83ad9054af3bd16c76f01779f5
diff --git a/vmvx_ukernel/CMakeLists.txt b/vmvx_ukernel/CMakeLists.txt
index 4272c4b..cf4edb3 100644
--- a/vmvx_ukernel/CMakeLists.txt
+++ b/vmvx_ukernel/CMakeLists.txt
@@ -7,7 +7,6 @@
"${IREE_RUNTIME_SOURCE_DIR}/builtins/ukernel/api.h"
SRCS
"elementwise.c"
- "elementwise_impl.c.inc"
"mmt4d_tile.c"
"query_tile_sizes.c"
"${IREE_RUNTIME_SOURCE_DIR}/builtins/ukernel/mmt4d.c"
diff --git a/vmvx_ukernel/elementwise.c b/vmvx_ukernel/elementwise.c
index d0e4683..5a06913 100644
--- a/vmvx_ukernel/elementwise.c
+++ b/vmvx_ukernel/elementwise.c
@@ -16,8 +16,365 @@
#include "iree/builtins/ukernel/elementwise.h"
-// Include the implementation helpers.
-#include "vmvx_ukernel/elementwise_impl.c.inc"
+#include <math.h>
+#include <riscv_vector.h>
+
+//===----------------------------------------------------------------------===//
+// Helpers for defining generic implementations of elementwise functions.
+// Since it affords the best code-size tradeoffs, each entry point is
+// dispatched based on an opcode.
+//===----------------------------------------------------------------------===//
+
+// Opcodes for generic functions operating on 32-bit operands and result.
+// Since the outer dispatcher only differentiates based on width, all other
+// type specificity is carried by the opcode.
+// Binary opcodes are named "X32B" and unary opcodes "X32U".
+// The initial list is sorted, and extensions are encouraged to keep it
+// sorted, but each opcode value must remain numerically stable, so the
+// list is not guaranteed to stay sorted over time.
+typedef enum {
+ IREE_UK_X32B_ADDF = 0,
+ IREE_UK_X32B_ADDI = 1,
+ IREE_UK_X32B_ANDI = 2,
+ IREE_UK_X32B_DIVF = 3,
+ IREE_UK_X32B_DIVSI = 4,
+ IREE_UK_X32B_DIVUI = 5,
+ IREE_UK_X32B_MULF = 6,
+ IREE_UK_X32B_MULI = 7,
+ IREE_UK_X32B_ORI = 8,
+ IREE_UK_X32B_SHLI = 9,
+ IREE_UK_X32B_SHRSI = 10,
+ IREE_UK_X32B_SHRUI = 11,
+ IREE_UK_X32B_SUBF = 12,
+ IREE_UK_X32B_SUBI = 13,
+  IREE_UK_X32B_XORI = 14,
+} iree_uk_x32b_opcode_t;
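+// A new opcode should be appended with the next unused value (currently 15)
+// rather than inserted in sorted order, so that existing values stay stable.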
+
+typedef enum {
+ IREE_UK_X32B_UI = 0, // unsigned integer
+ IREE_UK_X32B_SI = 1, // signed integer
+ IREE_UK_X32B_NA = 2, // not available in RVV
+} iree_uk_x32b_opcode_type_t;
+
+typedef enum {
+ IREE_UK_X32U_ABSF,
+ IREE_UK_X32U_CEILF,
+ IREE_UK_X32U_CTLZ,
+ IREE_UK_X32U_EXPF,
+ IREE_UK_X32U_FLOORF,
+ IREE_UK_X32U_LOGF,
+ IREE_UK_X32U_NEGF,
+ IREE_UK_X32U_RSQRTF,
+} iree_uk_x32u_opcode_t;
+
+// Macros to access various typed, dereferenced pointers.
+#define ASF32(ptr) *((float*)ptr)
+#define ASUI32(ptr) *((iree_uk_uint32_t*)ptr)
+#define ASSI32(ptr) *((iree_uk_int32_t*)ptr)
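+// For example, ASF32(out) = ASF32(lhs) + ASF32(rhs) adds two elements as
+// floats even though the surrounding code passes iree_uk_uint32_t pointers.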
+
+//===----------------------------------------------------------------------===//
+// Implementation macros.
+//===----------------------------------------------------------------------===//
+
+// Defines a generic "dispatched" implementation via opcode_t by invoking
+// the function iree_uk_{category}_2d.
+// Corresponds to the header macro DECLARE_UKERNEL_BINARY_2D.
+#define DISPATCH_UKERNEL_BINARY_2D(opcode, opcode_t, dtype, category) \
+ IREE_UK_EXPORT int iree_uk_##category##_##opcode##_2d( \
+ const dtype* lhs, iree_uk_ssize_t lhs_offset, \
+ iree_uk_ssize_t lhs_stride0, iree_uk_ssize_t lhs_stride1, \
+ const dtype* rhs, iree_uk_ssize_t rhs_offset, \
+ iree_uk_ssize_t rhs_stride0, iree_uk_ssize_t rhs_stride1, \
+ dtype* IREE_UK_RESTRICT out, iree_uk_ssize_t out_offset, \
+ iree_uk_ssize_t out_stride0, iree_uk_ssize_t out_stride1, \
+ iree_uk_ssize_t size0, iree_uk_ssize_t size1) { \
+ return iree_uk_##category##_2d(opcode_t, lhs, lhs_offset, lhs_stride0, \
+ lhs_stride1, rhs, rhs_offset, rhs_stride0, \
+ rhs_stride1, out, out_offset, out_stride0, \
+ out_stride1, size0, size1); \
+ }
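+// For example, DISPATCH_UKERNEL_BINARY_2D(addf, IREE_UK_X32B_ADDF,
+// iree_uk_uint32_t, x32b) defines iree_uk_x32b_addf_2d, which forwards its
+// arguments to iree_uk_x32b_2d along with the IREE_UK_X32B_ADDF opcode.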
+
+// Defines a generic "dispatched" implementation via opcode_t by invoking
+// the function iree_uk_generic_{category}_2d.
+// Corresponds to the header macro DECLARE_UKERNEL_UNARY_2D.
+#define DISPATCH_UKERNEL_UNARY_2D(opcode, opcode_t, dtype, category) \
+ IREE_UK_EXPORT int iree_uk_##category##_##opcode##_2d( \
+ const dtype* in, iree_uk_ssize_t in_offset, iree_uk_ssize_t in_stride0, \
+ iree_uk_ssize_t in_stride1, dtype* IREE_UK_RESTRICT out, \
+ iree_uk_ssize_t out_offset, iree_uk_ssize_t out_stride0, \
+ iree_uk_ssize_t out_stride1, iree_uk_ssize_t size0, \
+ iree_uk_ssize_t size1) { \
+ return iree_uk_generic_##category##_2d( \
+ opcode_t, in, in_offset, in_stride0, in_stride1, out, out_offset, \
+ out_stride0, out_stride1, size0, size1); \
+ }
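+// For example, DISPATCH_UKERNEL_UNARY_2D(absf, IREE_UK_X32U_ABSF,
+// iree_uk_uint32_t, x32u) defines iree_uk_x32u_absf_2d, which forwards to
+// iree_uk_generic_x32u_2d with the IREE_UK_X32U_ABSF opcode.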
+
+//===----------------------------------------------------------------------===//
+// Internal helpers.
+//===----------------------------------------------------------------------===//
+
+static iree_uk_x32b_opcode_type_t get_iree_uk_x32b_op_type(
+ iree_uk_x32b_opcode_t opcode) {
+ switch (opcode) {
+ case IREE_UK_X32B_ADDI:
+ case IREE_UK_X32B_ANDI:
+ case IREE_UK_X32B_DIVUI:
+ case IREE_UK_X32B_MULI:
+ case IREE_UK_X32B_ORI:
+ case IREE_UK_X32B_SHLI:
+ case IREE_UK_X32B_SHRUI:
+    case IREE_UK_X32B_XORI:
+ case IREE_UK_X32B_SUBI:
+ return IREE_UK_X32B_UI;
+ case IREE_UK_X32B_DIVSI:
+ return IREE_UK_X32B_SI;
+ default:
+ return IREE_UK_X32B_NA;
+ }
+}
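+// This classification both selects the RVV or scalar path in iree_uk_x32b_2d
+// and picks the unsigned vs. signed vector register type in
+// iree_uk_rvv_x32b_op below.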
+
+// Computes vl elements of an x32b opcode using RVV.
+static void iree_uk_rvv_x32b_op(iree_uk_x32b_opcode_t opcode, int* result_code,
+ const iree_uk_uint32_t* lhs,
+ iree_uk_ssize_t lhs_stride,
+ const iree_uk_uint32_t* rhs,
+ iree_uk_ssize_t rhs_stride,
+ iree_uk_uint32_t* out,
+ iree_uk_ssize_t out_stride, size_t vl) {
+ iree_uk_x32b_opcode_type_t op_type = get_iree_uk_x32b_op_type(opcode);
+ if (op_type == IREE_UK_X32B_UI) {
+ vuint32m8_t vx = vlse32_v_u32m8(lhs, lhs_stride, vl); // load
+ vuint32m8_t vy = vlse32_v_u32m8(rhs, rhs_stride, vl); // load
+ switch (opcode) {
+ case IREE_UK_X32B_ADDI:
+ vx = vadd(vx, vy, vl);
+ break;
+ case IREE_UK_X32B_ANDI:
+ vx = vand(vx, vy, vl);
+ break;
+ case IREE_UK_X32B_DIVUI:
+ vx = vdivu(vx, vy, vl);
+ break;
+ case IREE_UK_X32B_MULI:
+ vx = vmul(vx, vy, vl);
+ break;
+ case IREE_UK_X32B_ORI:
+ vx = vor(vx, vy, vl);
+ break;
+ case IREE_UK_X32B_SHLI:
+ vx = vsll(vx, vy, vl);
+ break;
+ case IREE_UK_X32B_SHRUI:
+ vx = vsrl(vx, vy, vl);
+ break;
+      case IREE_UK_X32B_XORI:
+        vx = vxor(vx, vy, vl);
+        break;
+ case IREE_UK_X32B_SUBI:
+ vx = vsub(vx, vy, vl);
+ break;
+ default:
+ *result_code = 1;
+ }
+ vsse32(out, out_stride, vx, vl); // save
+ } else if (op_type == IREE_UK_X32B_SI) {
+ vint32m8_t vx =
+ vlse32_v_i32m8((iree_uk_int32_t*)lhs, lhs_stride, vl); // load
+ vint32m8_t vy =
+ vlse32_v_i32m8((iree_uk_int32_t*)rhs, rhs_stride, vl); // load
+ switch (opcode) {
+ case IREE_UK_X32B_DIVSI:
+ vx = vdiv(vx, vy, vl);
+ break;
+ default:
+ *result_code = 1;
+ }
+ vsse32((iree_uk_int32_t*)out, out_stride, vx, vl); // save
+ } else {
+ *result_code = 1;
+ }
+}
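+// Note that the vlse32/vsse32 strides are in bytes, which is why callers
+// multiply the element strides by sizeof(uint32_t) before calling the
+// helper above.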
+
+// Computes a single element of an x32b opcode. On error, should set
+// |*result_code| to a non-zero value (but should not touch it otherwise).
+static void iree_uk_generic_x32b_op(iree_uk_x32b_opcode_t opcode,
+ int* result_code,
+ const iree_uk_uint32_t* lhs,
+ const iree_uk_uint32_t* rhs,
+ iree_uk_uint32_t* out) {
+ switch (opcode) {
+ case IREE_UK_X32B_ADDF:
+ ASF32(out) = ASF32(lhs) + ASF32(rhs);
+ return;
+ case IREE_UK_X32B_ADDI:
+ ASUI32(out) = ASUI32(lhs) + ASUI32(rhs);
+ return;
+ case IREE_UK_X32B_ANDI:
+ ASUI32(out) = ASUI32(lhs) & ASUI32(rhs);
+ return;
+ case IREE_UK_X32B_DIVF:
+ ASF32(out) = ASF32(lhs) / ASF32(rhs);
+ return;
+ case IREE_UK_X32B_DIVSI:
+ ASSI32(out) = ASSI32(lhs) / ASSI32(rhs);
+ return;
+ case IREE_UK_X32B_DIVUI:
+ ASUI32(out) = ASUI32(lhs) / ASUI32(rhs);
+ return;
+ case IREE_UK_X32B_MULF:
+ ASF32(out) = ASF32(lhs) * ASF32(rhs);
+ return;
+ case IREE_UK_X32B_MULI:
+ ASUI32(out) = ASUI32(lhs) * ASUI32(rhs);
+ return;
+ case IREE_UK_X32B_ORI:
+ ASUI32(out) = ASUI32(lhs) | ASUI32(rhs);
+ return;
+ case IREE_UK_X32B_SHLI:
+ ASUI32(out) = ASUI32(lhs) << ASUI32(rhs);
+ return;
+ case IREE_UK_X32B_SHRSI:
+ ASSI32(out) = ASSI32(lhs) >> ASSI32(rhs);
+ return;
+ case IREE_UK_X32B_SHRUI:
+ ASUI32(out) = ASUI32(lhs) >> ASUI32(rhs);
+ return;
+    case IREE_UK_X32B_XORI:
+ ASUI32(out) = ASUI32(lhs) ^ ASUI32(rhs);
+ return;
+ case IREE_UK_X32B_SUBF:
+ ASF32(out) = ASF32(lhs) - ASF32(rhs);
+ return;
+ case IREE_UK_X32B_SUBI:
+      ASUI32(out) = ASUI32(lhs) - ASUI32(rhs);
+ return;
+ default:
+ *result_code = 1;
+ }
+}
+
+// Computes a single element of an x32u opcode. Most are float ops. On error,
+// should set |*result_code| to a non-zero value (but should not touch it
+// otherwise).
+static void iree_uk_generic_x32u_op(iree_uk_x32u_opcode_t opcode,
+ int* result_code,
+ const iree_uk_uint32_t* in,
+ iree_uk_uint32_t* out) {
+ switch (opcode) {
+ case IREE_UK_X32U_ABSF:
+ ASF32(out) = fabsf(ASF32(in));
+ return;
+ case IREE_UK_X32U_CEILF:
+ ASF32(out) = ceilf(ASF32(in));
+ return;
+ case IREE_UK_X32U_CTLZ:
+ ASUI32(out) = iree_uk_count_leading_zeros_u32(ASUI32(in));
+ return;
+ case IREE_UK_X32U_EXPF:
+ ASF32(out) = expf(ASF32(in));
+ return;
+ case IREE_UK_X32U_FLOORF:
+ ASF32(out) = floorf(ASF32(in));
+ return;
+ case IREE_UK_X32U_LOGF:
+ ASF32(out) = logf(ASF32(in));
+ return;
+ case IREE_UK_X32U_NEGF:
+ ASF32(out) = -ASF32(in);
+ return;
+ case IREE_UK_X32U_RSQRTF:
+ ASF32(out) = 1.0f / sqrtf(ASF32(in));
+ return;
+ default:
+ *result_code = 1;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Opcode dispatch entry points.
+//===----------------------------------------------------------------------===//
+
+// 32bit binary kernels.
+IREE_UK_ATTRIBUTE_NOINLINE static int iree_uk_x32b_2d(
+ iree_uk_x32b_opcode_t opcode,
+ // LHS.
+ const iree_uk_uint32_t* lhs, iree_uk_ssize_t lhs_offset,
+ iree_uk_ssize_t lhs_stride0, iree_uk_ssize_t lhs_stride1,
+ // RHS
+ const iree_uk_uint32_t* rhs, iree_uk_ssize_t rhs_offset,
+ iree_uk_ssize_t rhs_stride0, iree_uk_ssize_t rhs_stride1,
+ // OUT.
+ iree_uk_uint32_t* IREE_UK_RESTRICT out, iree_uk_ssize_t out_offset,
+ iree_uk_ssize_t out_stride0, iree_uk_ssize_t out_stride1,
+ // Sizes.
+ iree_uk_ssize_t size0, iree_uk_ssize_t size1) {
+ int result_code = 0;
+
+ if (get_iree_uk_x32b_op_type(opcode) != IREE_UK_X32B_NA) {
+ size_t vl;
+    // Make the most of vectorization by vectorizing along the larger
+    // dimension: the strided vector loop runs over dim 1 when size0 < size1
+    // and over dim 0 otherwise.
+ if (size0 < size1) {
+ for (iree_uk_ssize_t i = 0; i < size0; ++i) {
+ for (iree_uk_ssize_t j = 0; j < size1; j += vl) {
+ vl = vsetvl_e32m8(size1 - j);
+ iree_uk_rvv_x32b_op(opcode, &result_code,
+ &lhs[i * lhs_stride0 + j * lhs_stride1],
+ lhs_stride1 * sizeof(uint32_t),
+ &rhs[i * rhs_stride0 + j * rhs_stride1],
+ rhs_stride1 * sizeof(uint32_t),
+ &out[i * out_stride0 + j * out_stride1],
+ out_stride1 * sizeof(uint32_t), vl);
+ }
+ }
+ } else {
+ for (iree_uk_ssize_t j = 0; j < size1; ++j) {
+ for (iree_uk_ssize_t i = 0; i < size0; i += vl) {
+ vl = vsetvl_e32m8(size0 - i);
+ iree_uk_rvv_x32b_op(opcode, &result_code,
+ &lhs[i * lhs_stride0 + j * lhs_stride1],
+ lhs_stride0 * sizeof(uint32_t),
+ &rhs[i * rhs_stride0 + j * rhs_stride1],
+ rhs_stride0 * sizeof(uint32_t),
+ &out[i * out_stride0 + j * out_stride1],
+ out_stride0 * sizeof(uint32_t), vl);
+ }
+ }
+ }
+ } else {
+ for (iree_uk_ssize_t i = 0; i < size0; ++i) {
+ for (iree_uk_ssize_t j = 0; j < size1; ++j) {
+ iree_uk_generic_x32b_op(opcode, &result_code,
+ &lhs[i * lhs_stride0 + j * lhs_stride1],
+ &rhs[i * rhs_stride0 + j * rhs_stride1],
+ &out[i * out_stride0 + j * out_stride1]);
+ }
+ }
+ }
+ return result_code;
+}
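+// Note that the byte stride handed to iree_uk_rvv_x32b_op is always that of
+// the dimension being vectorized; the other dimension advances one element
+// per iteration of the outer scalar loop.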
+
+// Generic 32bit unary kernels.
+IREE_UK_ATTRIBUTE_NOINLINE static int iree_uk_generic_x32u_2d(
+ iree_uk_x32u_opcode_t opcode,
+ // IN.
+ const iree_uk_uint32_t* in, iree_uk_ssize_t in_offset,
+ iree_uk_ssize_t in_stride0, iree_uk_ssize_t in_stride1,
+ // OUT.
+ iree_uk_uint32_t* IREE_UK_RESTRICT out, iree_uk_ssize_t out_offset,
+ iree_uk_ssize_t out_stride0, iree_uk_ssize_t out_stride1,
+ // Sizes.
+ iree_uk_ssize_t size0, iree_uk_ssize_t size1) {
+ int result_code = 0;
+ // TODO: Manually unroll to x4 to trigger vectorization.
+ for (iree_uk_ssize_t i = 0; i < size0; ++i) {
+ for (iree_uk_ssize_t j = 0; j < size1; ++j) {
+ iree_uk_generic_x32u_op(opcode, &result_code,
+ &in[i * in_stride0 + j * in_stride1],
+ &out[i * out_stride0 + j * out_stride1]);
+ }
+ }
+ return result_code;
+}
DISPATCH_UKERNEL_BINARY_2D(addf, IREE_UK_X32B_ADDF, iree_uk_uint32_t, x32b);
DISPATCH_UKERNEL_BINARY_2D(addi, IREE_UK_X32B_ADDI, iree_uk_uint32_t, x32b);
diff --git a/vmvx_ukernel/elementwise_impl.c.inc b/vmvx_ukernel/elementwise_impl.c.inc
deleted file mode 100644
index 4de26b0..0000000
--- a/vmvx_ukernel/elementwise_impl.c.inc
+++ /dev/null
@@ -1,377 +0,0 @@
-/*
- * Copyright 2023 Google LLC
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "iree/builtins/ukernel/common.h"
-
-#include <math.h>
-#include <riscv_vector.h>
-
-//===----------------------------------------------------------------------===//
-// Helpers for defining generic implementations of elementwise functions.
-// Since it affords the best code size tradeoff options, the entrypoint
-// is dispatched based on an opcode.
-//===----------------------------------------------------------------------===//
-
-// Opcodes for generic functions operating on 32-bit operands and result.
-// Since the outer dispatcher only differentiates based on width, all other
-// type specificity is carried by the opcode.
-// Binary opcodes are named "X32B" and unary opcodes "X32U".
-// The initial list was sorted, and it is encouraged to sort extensions, but
-// each opcode must be numerically stable, so the list is not expected to
-// be sorted over time.
-typedef enum {
- IREE_UK_X32B_ADDF = 0,
- IREE_UK_X32B_ADDI = 1,
- IREE_UK_X32B_ANDI = 2,
- IREE_UK_X32B_DIVF = 3,
- IREE_UK_X32B_DIVSI = 4,
- IREE_UK_X32B_DIVUI = 5,
- IREE_UK_X32B_MULF = 6,
- IREE_UK_X32B_MULI = 7,
- IREE_UK_X32B_ORI = 8,
- IREE_UK_X32B_SHLI = 9,
- IREE_UK_X32B_SHRSI = 10,
- IREE_UK_X32B_SHRUI = 11,
- IREE_UK_X32B_SUBF = 12,
- IREE_UK_X32B_SUBI = 13,
- IREE_UKENREL_X32B_XORI = 14,
-} iree_uk_x32b_opcode_t;
-
-typedef enum {
- IREE_UK_X32B_UI = 0, // unsigned integer
- IREE_UK_X32B_SI = 1, // signed integer
- IREE_UK_X32B_NA = 2, // not available in RVV
-} iree_uk_x32b_opcode_type_t;
-
-typedef enum {
- IREE_UK_X32U_ABSF,
- IREE_UK_X32U_CEILF,
- IREE_UK_X32U_CTLZ,
- IREE_UK_X32U_EXPF,
- IREE_UK_X32U_FLOORF,
- IREE_UK_X32U_LOGF,
- IREE_UK_X32U_NEGF,
- IREE_UK_X32U_RSQRTF,
-} iree_uk_x32u_opcode_t;
-
-// Macros to access various typed, dereferenced pointers.
-#define ASF32(ptr) *((float*)ptr)
-#define ASUI32(ptr) *((iree_uk_uint32_t*)ptr)
-#define ASSI32(ptr) *((iree_uk_int32_t*)ptr)
-
-//===----------------------------------------------------------------------===//
-// Implementation macros.
-//===----------------------------------------------------------------------===//
-
-// Defines a generic "dispatched" implementation via opcode_t by invoking
-// the function iree_uk_generic_{category}_2d.
-// Corresponds to the header macro DECLARE_UKERNEL_BINARY_2D.
-#define DISPATCH_UKERNEL_BINARY_2D(opcode, opcode_t, dtype, category) \
- IREE_UK_EXPORT int iree_uk_##category##_##opcode##_2d( \
- const dtype* lhs, iree_uk_ssize_t lhs_offset, \
- iree_uk_ssize_t lhs_stride0, iree_uk_ssize_t lhs_stride1, \
- const dtype* rhs, iree_uk_ssize_t rhs_offset, \
- iree_uk_ssize_t rhs_stride0, iree_uk_ssize_t rhs_stride1, \
- dtype* IREE_UK_RESTRICT out, iree_uk_ssize_t out_offset, \
- iree_uk_ssize_t out_stride0, iree_uk_ssize_t out_stride1, \
- iree_uk_ssize_t size0, iree_uk_ssize_t size1) { \
- return iree_uk_##category##_2d(opcode_t, lhs, lhs_offset, lhs_stride0, \
- lhs_stride1, rhs, rhs_offset, rhs_stride0, \
- rhs_stride1, out, out_offset, out_stride0, \
- out_stride1, size0, size1); \
- }
-
-// Defines a generic "dispatched" implementation via opcode_t by invoking
-// the function iree_uk_generic_{category}_2d.
-// Corresponds to the header macro DECLARE_UKERNEL_BINARY_2D.
-#define DISPATCH_UKERNEL_UNARY_2D(opcode, opcode_t, dtype, category) \
- IREE_UK_EXPORT int iree_uk_##category##_##opcode##_2d( \
- const dtype* in, iree_uk_ssize_t in_offset, iree_uk_ssize_t in_stride0, \
- iree_uk_ssize_t in_stride1, dtype* IREE_UK_RESTRICT out, \
- iree_uk_ssize_t out_offset, iree_uk_ssize_t out_stride0, \
- iree_uk_ssize_t out_stride1, iree_uk_ssize_t size0, \
- iree_uk_ssize_t size1) { \
- return iree_uk_generic_##category##_2d( \
- opcode_t, in, in_offset, in_stride0, in_stride1, out, out_offset, \
- out_stride0, out_stride1, size0, size1); \
- }
-
-//===----------------------------------------------------------------------===//
-// Internal helpers.
-//===----------------------------------------------------------------------===//
-
-static iree_uk_x32b_opcode_type_t get_iree_uk_x32b_op_type(
- iree_uk_x32b_opcode_t opcode) {
- switch (opcode) {
- case IREE_UK_X32B_ADDI:
- case IREE_UK_X32B_ANDI:
- case IREE_UK_X32B_DIVUI:
- case IREE_UK_X32B_MULI:
- case IREE_UK_X32B_ORI:
- case IREE_UK_X32B_SHLI:
- case IREE_UK_X32B_SHRUI:
- case IREE_UKENREL_X32B_XORI:
- case IREE_UK_X32B_SUBI:
- return IREE_UK_X32B_UI;
- case IREE_UK_X32B_DIVSI:
- return IREE_UK_X32B_SI;
- default:
- return IREE_UK_X32B_NA;
- }
-}
-
-// Computes a single element of an x32b opcode usinbg RVV.
-static void iree_uk_rvv_x32b_op(iree_uk_x32b_opcode_t opcode, int* result_code,
- const iree_uk_uint32_t* lhs,
- iree_uk_ssize_t lhs_stride,
- const iree_uk_uint32_t* rhs,
- iree_uk_ssize_t rhs_stride,
- iree_uk_uint32_t* out,
- iree_uk_ssize_t out_stride, size_t vl) {
- iree_uk_x32b_opcode_type_t op_type = get_iree_uk_x32b_op_type(opcode);
- if (op_type == IREE_UK_X32B_UI) {
- vuint32m8_t vx = vlse32_v_u32m8(lhs, lhs_stride, vl); // load
- vuint32m8_t vy = vlse32_v_u32m8(rhs, rhs_stride, vl); // load
- switch (opcode) {
- case IREE_UK_X32B_ADDI:
- vx = vadd(vx, vy, vl);
- break;
- case IREE_UK_X32B_ANDI:
- vx = vand(vx, vy, vl);
- break;
- case IREE_UK_X32B_DIVUI:
- vx = vdivu(vx, vy, vl);
- break;
- case IREE_UK_X32B_MULI:
- vx = vmul(vx, vy, vl);
- break;
- case IREE_UK_X32B_ORI:
- vx = vor(vx, vy, vl);
- break;
- case IREE_UK_X32B_SHLI:
- vx = vsll(vx, vy, vl);
- break;
- case IREE_UK_X32B_SHRUI:
- vx = vsrl(vx, vy, vl);
- break;
- case IREE_UKENREL_X32B_XORI:
- vx = vor(vx, vy, vl);
- break;
- case IREE_UK_X32B_SUBI:
- vx = vsub(vx, vy, vl);
- break;
- default:
- *result_code = 1;
- }
- vsse32(out, out_stride, vx, vl); // save
- } else if (op_type == IREE_UK_X32B_SI) {
- vint32m8_t vx =
- vlse32_v_i32m8((iree_uk_int32_t*)lhs, lhs_stride, vl); // load
- vint32m8_t vy =
- vlse32_v_i32m8((iree_uk_int32_t*)rhs, rhs_stride, vl); // load
- switch (opcode) {
- case IREE_UK_X32B_DIVSI:
- vx = vdiv(vx, vy, vl);
- break;
- default:
- *result_code = 1;
- }
- vsse32((iree_uk_int32_t*)out, out_stride, vx, vl); // save
- } else {
- *result_code = 1;
- }
-}
-
-// Computes a single element of an x32b opcode. On error, should set
-// |*result_code| to a non-zero value (but should not touch it otherwise).
-static void iree_uk_generic_x32b_op(iree_uk_x32b_opcode_t opcode,
- int* result_code,
- const iree_uk_uint32_t* lhs,
- const iree_uk_uint32_t* rhs,
- iree_uk_uint32_t* out) {
- switch (opcode) {
- case IREE_UK_X32B_ADDF:
- ASF32(out) = ASF32(lhs) + ASF32(rhs);
- return;
- case IREE_UK_X32B_ADDI:
- ASUI32(out) = ASUI32(lhs) + ASUI32(rhs);
- return;
- case IREE_UK_X32B_ANDI:
- ASUI32(out) = ASUI32(lhs) & ASUI32(rhs);
- return;
- case IREE_UK_X32B_DIVF:
- ASF32(out) = ASF32(lhs) / ASF32(rhs);
- return;
- case IREE_UK_X32B_DIVSI:
- ASSI32(out) = ASSI32(lhs) / ASSI32(rhs);
- return;
- case IREE_UK_X32B_DIVUI:
- ASUI32(out) = ASUI32(lhs) / ASUI32(rhs);
- return;
- case IREE_UK_X32B_MULF:
- ASF32(out) = ASF32(lhs) * ASF32(rhs);
- return;
- case IREE_UK_X32B_MULI:
- ASUI32(out) = ASUI32(lhs) * ASUI32(rhs);
- return;
- case IREE_UK_X32B_ORI:
- ASUI32(out) = ASUI32(lhs) | ASUI32(rhs);
- return;
- case IREE_UK_X32B_SHLI:
- ASUI32(out) = ASUI32(lhs) << ASUI32(rhs);
- return;
- case IREE_UK_X32B_SHRSI:
- ASSI32(out) = ASSI32(lhs) >> ASSI32(rhs);
- return;
- case IREE_UK_X32B_SHRUI:
- ASUI32(out) = ASUI32(lhs) >> ASUI32(rhs);
- return;
- case IREE_UKENREL_X32B_XORI:
- ASUI32(out) = ASUI32(lhs) ^ ASUI32(rhs);
- return;
- case IREE_UK_X32B_SUBF:
- ASF32(out) = ASF32(lhs) - ASF32(rhs);
- return;
- case IREE_UK_X32B_SUBI:
- ASSI32(out) = ASUI32(lhs) - ASUI32(rhs);
- return;
- default:
- *result_code = 1;
- }
-}
-
-// Computes a single element of an x32u opcode. Most are float ops. On error,
-// should set |*result_code| to a non-zero value (but should not touch it
-// otherwise).
-static void iree_uk_generic_x32u_op(iree_uk_x32u_opcode_t opcode,
- int* result_code,
- const iree_uk_uint32_t* in,
- iree_uk_uint32_t* out) {
- switch (opcode) {
- case IREE_UK_X32U_ABSF:
- ASF32(out) = fabsf(ASF32(in));
- return;
- case IREE_UK_X32U_CEILF:
- ASF32(out) = ceilf(ASF32(in));
- return;
- case IREE_UK_X32U_CTLZ:
- ASUI32(out) = iree_uk_count_leading_zeros_u32(ASUI32(in));
- return;
- case IREE_UK_X32U_EXPF:
- ASF32(out) = expf(ASF32(in));
- return;
- case IREE_UK_X32U_FLOORF:
- ASF32(out) = floorf(ASF32(in));
- return;
- case IREE_UK_X32U_LOGF:
- ASF32(out) = logf(ASF32(in));
- return;
- case IREE_UK_X32U_NEGF:
- ASF32(out) = -ASF32(in);
- return;
- case IREE_UK_X32U_RSQRTF:
- ASF32(out) = 1.0f / sqrtf(ASF32(in));
- return;
- default:
- *result_code = 1;
- }
-}
-
-//===----------------------------------------------------------------------===//
-// Opcode dispatch entry points.
-//===----------------------------------------------------------------------===//
-
-// 32bit binary kernels.
-IREE_UK_ATTRIBUTE_NOINLINE static int iree_uk_x32b_2d(
- iree_uk_x32b_opcode_t opcode,
- // LHS.
- const iree_uk_uint32_t* lhs, iree_uk_ssize_t lhs_offset,
- iree_uk_ssize_t lhs_stride0, iree_uk_ssize_t lhs_stride1,
- // RHS
- const iree_uk_uint32_t* rhs, iree_uk_ssize_t rhs_offset,
- iree_uk_ssize_t rhs_stride0, iree_uk_ssize_t rhs_stride1,
- // OUT.
- iree_uk_uint32_t* IREE_UK_RESTRICT out, iree_uk_ssize_t out_offset,
- iree_uk_ssize_t out_stride0, iree_uk_ssize_t out_stride1,
- // Sizes.
- iree_uk_ssize_t size0, iree_uk_ssize_t size1) {
- int result_code = 0;
-
- if (get_iree_uk_x32b_op_type(opcode) != IREE_UK_X32B_NA) {
- size_t vl;
- // make most use of vectorization by swiching dimension
- if (size0 < size1) {
- for (iree_uk_ssize_t i = 0; i < size0; ++i) {
- for (iree_uk_ssize_t j = 0; j < size1; j += vl) {
- vl = vsetvl_e32m8(size1 - j);
- iree_uk_rvv_x32b_op(opcode, &result_code,
- &lhs[i * lhs_stride0 + j * lhs_stride1],
- lhs_stride1 * sizeof(uint32_t),
- &rhs[i * rhs_stride0 + j * rhs_stride1],
- rhs_stride1 * sizeof(uint32_t),
- &out[i * out_stride0 + j * out_stride1],
- out_stride1 * sizeof(uint32_t), vl);
- }
- }
- } else {
- for (iree_uk_ssize_t j = 0; j < size1; ++j) {
- for (iree_uk_ssize_t i = 0; i < size0; i += vl) {
- vl = vsetvl_e32m8(size0 - i);
- iree_uk_rvv_x32b_op(opcode, &result_code,
- &lhs[i * lhs_stride0 + j * lhs_stride1],
- lhs_stride0 * sizeof(uint32_t),
- &rhs[i * rhs_stride0 + j * rhs_stride1],
- rhs_stride0 * sizeof(uint32_t),
- &out[i * out_stride0 + j * out_stride1],
- out_stride0 * sizeof(uint32_t), vl);
- }
- }
- }
- } else {
- for (iree_uk_ssize_t i = 0; i < size0; ++i) {
- for (iree_uk_ssize_t j = 0; j < size1; ++j) {
- iree_uk_generic_x32b_op(opcode, &result_code,
- &lhs[i * lhs_stride0 + j * lhs_stride1],
- &rhs[i * rhs_stride0 + j * rhs_stride1],
- &out[i * out_stride0 + j * out_stride1]);
- }
- }
- }
- return result_code;
-}
-
-// Generic 32bit unary kernels.
-IREE_UK_ATTRIBUTE_NOINLINE static int iree_uk_generic_x32u_2d(
- iree_uk_x32u_opcode_t opcode,
- // IN.
- const iree_uk_uint32_t* in, iree_uk_ssize_t in_offset,
- iree_uk_ssize_t in_stride0, iree_uk_ssize_t in_stride1,
- // OUT.
- iree_uk_uint32_t* IREE_UK_RESTRICT out, iree_uk_ssize_t out_offset,
- iree_uk_ssize_t out_stride0, iree_uk_ssize_t out_stride1,
- // Sizes.
- iree_uk_ssize_t size0, iree_uk_ssize_t size1) {
- int result_code = 0;
- // TODO: Manually unroll to x4 to trigger vectorization.
- for (iree_uk_ssize_t i = 0; i < size0; ++i) {
- for (iree_uk_ssize_t j = 0; j < size1; ++j) {
- iree_uk_generic_x32u_op(opcode, &result_code,
- &in[i * in_stride0 + j * in_stride1],
- &out[i * out_stride0 + j * out_stride1]);
- }
- }
- return result_code;
-}