blob: 4de26b0ca75638db6b68b21973bb3bd2026b2746 [file] [log] [blame]
/*
* Copyright 2023 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "iree/builtins/ukernel/common.h"
#include <math.h>
#include <riscv_vector.h>
//===----------------------------------------------------------------------===//
// Helpers for defining generic implementations of elementwise functions.
// Since it affords the best code size tradeoff options, the entrypoint
// is dispatched based on an opcode.
//===----------------------------------------------------------------------===//
// Opcodes for generic functions operating on 32-bit operands and result.
// Since the outer dispatcher only differentiates based on width, all other
// type specificity is carried by the opcode.
// Binary opcodes are named "X32B" and unary opcodes "X32U".
// The initial list was sorted, and it is encouraged to sort extensions, but
// each opcode must be numerically stable, so the list is not expected to
// be sorted over time.
typedef enum {
  IREE_UK_X32B_ADDF = 0,    // float add
  IREE_UK_X32B_ADDI = 1,    // integer add
  IREE_UK_X32B_ANDI = 2,    // bitwise and
  IREE_UK_X32B_DIVF = 3,    // float divide
  IREE_UK_X32B_DIVSI = 4,   // signed integer divide
  IREE_UK_X32B_DIVUI = 5,   // unsigned integer divide
  IREE_UK_X32B_MULF = 6,    // float multiply
  IREE_UK_X32B_MULI = 7,    // integer multiply
  IREE_UK_X32B_ORI = 8,     // bitwise or
  IREE_UK_X32B_SHLI = 9,    // shift left
  IREE_UK_X32B_SHRSI = 10,  // shift right on signed operands
  IREE_UK_X32B_SHRUI = 11,  // shift right on unsigned operands
  IREE_UK_X32B_SUBF = 12,   // float subtract
  IREE_UK_X32B_SUBI = 13,   // integer subtract
  IREE_UK_X32B_XORI = 14,   // bitwise xor
  // Misspelled name that predates IREE_UK_X32B_XORI. Kept as an alias with the
  // same value so existing references keep compiling; prefer the name above.
  IREE_UKENREL_X32B_XORI = IREE_UK_X32B_XORI,
} iree_uk_x32b_opcode_t;
// Classifies a binary opcode by the vector lane type its RVV implementation
// operates on, or marks it as having no RVV implementation in this file.
typedef enum {
  // Handled with unsigned 32-bit vector lanes.
  IREE_UK_X32B_UI = 0,
  // Handled with signed 32-bit vector lanes.
  IREE_UK_X32B_SI = 1,
  // No RVV implementation here; the scalar code path is used instead.
  IREE_UK_X32B_NA = 2,
} iree_uk_x32b_opcode_type_t;
// Opcodes for unary functions operating on a 32-bit operand and result.
// Values are spelled out so that they stay fixed if the list is extended.
typedef enum {
  IREE_UK_X32U_ABSF = 0,    // fabsf
  IREE_UK_X32U_CEILF = 1,   // ceilf
  IREE_UK_X32U_CTLZ = 2,    // count leading zeros of an unsigned value
  IREE_UK_X32U_EXPF = 3,    // expf
  IREE_UK_X32U_FLOORF = 4,  // floorf
  IREE_UK_X32U_LOGF = 5,    // logf
  IREE_UK_X32U_NEGF = 6,    // float negation
  IREE_UK_X32U_RSQRTF = 7,  // 1 / sqrtf
} iree_uk_x32u_opcode_t;
// Macros to access various typed, dereferenced pointers. Each expands to an
// lvalue, so it can be read from or assigned to.
// The argument and the full expansion are parenthesized so that expression
// arguments (e.g. `base + i`) are cast as a whole and so that the result
// composes safely with neighbouring operators.
#define ASF32(ptr) (*((float*)(ptr)))
#define ASUI32(ptr) (*((iree_uk_uint32_t*)(ptr)))
#define ASSI32(ptr) (*((iree_uk_int32_t*)(ptr)))
//===----------------------------------------------------------------------===//
// Implementation macros.
//===----------------------------------------------------------------------===//
// Stamps out the exported entry point iree_uk_{category}_{opcode}_2d for one
// binary opcode. The generated function does no work of its own: it forwards
// every argument, with |opcode_t| prepended, to iree_uk_{category}_2d and
// returns that function's result.
//   opcode:   op name spliced into the exported symbol.
//   opcode_t: opcode value passed as the first argument of the callee.
//   dtype:    element type of the lhs/rhs/out pointers.
//   category: kernel family spliced into both the exported and callee names.
// Corresponds to the header macro DECLARE_UKERNEL_BINARY_2D.
#define DISPATCH_UKERNEL_BINARY_2D(opcode, opcode_t, dtype, category) \
  IREE_UK_EXPORT int iree_uk_##category##_##opcode##_2d( \
      /* LHS. */ \
      const dtype* lhs, \
      iree_uk_ssize_t lhs_offset, \
      iree_uk_ssize_t lhs_stride0, \
      iree_uk_ssize_t lhs_stride1, \
      /* RHS. */ \
      const dtype* rhs, \
      iree_uk_ssize_t rhs_offset, \
      iree_uk_ssize_t rhs_stride0, \
      iree_uk_ssize_t rhs_stride1, \
      /* OUT. */ \
      dtype* IREE_UK_RESTRICT out, \
      iree_uk_ssize_t out_offset, \
      iree_uk_ssize_t out_stride0, \
      iree_uk_ssize_t out_stride1, \
      /* Sizes. */ \
      iree_uk_ssize_t size0, \
      iree_uk_ssize_t size1) { \
    return iree_uk_##category##_2d( \
        opcode_t, \
        lhs, lhs_offset, lhs_stride0, lhs_stride1, \
        rhs, rhs_offset, rhs_stride0, rhs_stride1, \
        out, out_offset, out_stride0, out_stride1, \
        size0, size1); \
  }
// Stamps out the exported entry point iree_uk_{category}_{opcode}_2d for one
// unary opcode. The generated function does no work of its own: it forwards
// every argument, with |opcode_t| prepended, to
// iree_uk_generic_{category}_2d and returns that function's result.
//   opcode:   op name spliced into the exported symbol.
//   opcode_t: opcode value passed as the first argument of the callee.
//   dtype:    element type of the in/out pointers.
//   category: kernel family spliced into both the exported and callee names.
// Presumably the counterpart of a header macro DECLARE_UKERNEL_UNARY_2D --
// confirm against the header.
#define DISPATCH_UKERNEL_UNARY_2D(opcode, opcode_t, dtype, category) \
  IREE_UK_EXPORT int iree_uk_##category##_##opcode##_2d( \
      /* IN. */ \
      const dtype* in, \
      iree_uk_ssize_t in_offset, \
      iree_uk_ssize_t in_stride0, \
      iree_uk_ssize_t in_stride1, \
      /* OUT. */ \
      dtype* IREE_UK_RESTRICT out, \
      iree_uk_ssize_t out_offset, \
      iree_uk_ssize_t out_stride0, \
      iree_uk_ssize_t out_stride1, \
      /* Sizes. */ \
      iree_uk_ssize_t size0, \
      iree_uk_ssize_t size1) { \
    return iree_uk_generic_##category##_2d( \
        opcode_t, \
        in, in_offset, in_stride0, in_stride1, \
        out, out_offset, out_stride0, out_stride1, \
        size0, size1); \
  }
//===----------------------------------------------------------------------===//
// Internal helpers.
//===----------------------------------------------------------------------===//
// Maps a binary opcode to the lane type used by its RVV implementation.
// Returns IREE_UK_X32B_SI for signed division, IREE_UK_X32B_UI for the integer
// opcodes listed below, and IREE_UK_X32B_NA for everything else (including
// opcode values this file does not know about).
static iree_uk_x32b_opcode_type_t get_iree_uk_x32b_op_type(
    iree_uk_x32b_opcode_t opcode) {
  // Signed division is the only opcode mapped to signed lanes.
  if (opcode == IREE_UK_X32B_DIVSI) {
    return IREE_UK_X32B_SI;
  }

  const int uses_unsigned_lanes =
      opcode == IREE_UK_X32B_ADDI || opcode == IREE_UK_X32B_ANDI ||
      opcode == IREE_UK_X32B_DIVUI || opcode == IREE_UK_X32B_MULI ||
      opcode == IREE_UK_X32B_ORI || opcode == IREE_UK_X32B_SHLI ||
      opcode == IREE_UK_X32B_SHRUI || opcode == IREE_UKENREL_X32B_XORI ||
      opcode == IREE_UK_X32B_SUBI;
  if (uses_unsigned_lanes) {
    return IREE_UK_X32B_UI;
  }

  return IREE_UK_X32B_NA;
}
// Computes |vl| elements of an x32b opcode using RVV.
//
// Operands are read with strided vector loads and the result is written with
// a strided vector store, so |lhs_stride|, |rhs_stride| and |out_stride| are
// distances in bytes between consecutive elements (the caller scales element
// strides by the element size). |vl| is the number of elements to process and
// is expected to come from vsetvl_e32m8.
//
// On error (an opcode with no RVV implementation), sets |*result_code| to a
// non-zero value and leaves |out| untouched. |*result_code| is not written
// otherwise.
static void iree_uk_rvv_x32b_op(iree_uk_x32b_opcode_t opcode, int* result_code,
                                const iree_uk_uint32_t* lhs,
                                iree_uk_ssize_t lhs_stride,
                                const iree_uk_uint32_t* rhs,
                                iree_uk_ssize_t rhs_stride,
                                iree_uk_uint32_t* out,
                                iree_uk_ssize_t out_stride, size_t vl) {
  iree_uk_x32b_opcode_type_t op_type = get_iree_uk_x32b_op_type(opcode);
  if (op_type == IREE_UK_X32B_UI) {
    vuint32m8_t vx = vlse32_v_u32m8(lhs, lhs_stride, vl);  // load
    vuint32m8_t vy = vlse32_v_u32m8(rhs, rhs_stride, vl);  // load
    switch (opcode) {
      case IREE_UK_X32B_ADDI:
        vx = vadd(vx, vy, vl);
        break;
      case IREE_UK_X32B_ANDI:
        vx = vand(vx, vy, vl);
        break;
      case IREE_UK_X32B_DIVUI:
        vx = vdivu(vx, vy, vl);
        break;
      case IREE_UK_X32B_MULI:
        vx = vmul(vx, vy, vl);
        break;
      case IREE_UK_X32B_ORI:
        vx = vor(vx, vy, vl);
        break;
      case IREE_UK_X32B_SHLI:
        vx = vsll(vx, vy, vl);
        break;
      case IREE_UK_X32B_SHRUI:
        vx = vsrl(vx, vy, vl);
        break;
      case IREE_UKENREL_X32B_XORI:
        // Must be xor: this case previously issued vor, which made XORI
        // produce the same result as ORI.
        vx = vxor(vx, vy, vl);
        break;
      case IREE_UK_X32B_SUBI:
        vx = vsub(vx, vy, vl);
        break;
      default:
        // Classified as unsigned but not implemented above: report the error
        // and do not store a meaningless result.
        *result_code = 1;
        return;
    }
    vsse32(out, out_stride, vx, vl);  // save
  } else if (op_type == IREE_UK_X32B_SI) {
    vint32m8_t vx =
        vlse32_v_i32m8((const iree_uk_int32_t*)lhs, lhs_stride, vl);  // load
    vint32m8_t vy =
        vlse32_v_i32m8((const iree_uk_int32_t*)rhs, rhs_stride, vl);  // load
    switch (opcode) {
      case IREE_UK_X32B_DIVSI:
        vx = vdiv(vx, vy, vl);
        break;
      default:
        // Classified as signed but not implemented above: report the error
        // and do not store a meaningless result.
        *result_code = 1;
        return;
    }
    vsse32((iree_uk_int32_t*)out, out_stride, vx, vl);  // save
  } else {
    // No RVV implementation for this opcode.
    *result_code = 1;
  }
}
// Computes a single element of an x32b opcode with scalar C arithmetic.
//
// |lhs|, |rhs| and |out| each point at one 32-bit element; the opcode decides
// whether the bits are interpreted as float, unsigned or signed integer.
// On error (unknown opcode), sets |*result_code| to a non-zero value and does
// not write |out|. |*result_code| is not touched otherwise.
//
// NOTE(review): no operand validation is done here. Integer division by zero,
// signed division overflow and shift counts outside [0, 31] are undefined in
// C -- presumably callers guarantee valid operands; confirm.
static void iree_uk_generic_x32b_op(iree_uk_x32b_opcode_t opcode,
                                    int* result_code,
                                    const iree_uk_uint32_t* lhs,
                                    const iree_uk_uint32_t* rhs,
                                    iree_uk_uint32_t* out) {
  switch (opcode) {
    case IREE_UK_X32B_ADDF:
      ASF32(out) = ASF32(lhs) + ASF32(rhs);
      return;
    case IREE_UK_X32B_ADDI:
      ASUI32(out) = ASUI32(lhs) + ASUI32(rhs);
      return;
    case IREE_UK_X32B_ANDI:
      ASUI32(out) = ASUI32(lhs) & ASUI32(rhs);
      return;
    case IREE_UK_X32B_DIVF:
      ASF32(out) = ASF32(lhs) / ASF32(rhs);
      return;
    case IREE_UK_X32B_DIVSI:
      ASSI32(out) = ASSI32(lhs) / ASSI32(rhs);
      return;
    case IREE_UK_X32B_DIVUI:
      ASUI32(out) = ASUI32(lhs) / ASUI32(rhs);
      return;
    case IREE_UK_X32B_MULF:
      ASF32(out) = ASF32(lhs) * ASF32(rhs);
      return;
    case IREE_UK_X32B_MULI:
      ASUI32(out) = ASUI32(lhs) * ASUI32(rhs);
      return;
    case IREE_UK_X32B_ORI:
      ASUI32(out) = ASUI32(lhs) | ASUI32(rhs);
      return;
    case IREE_UK_X32B_SHLI:
      ASUI32(out) = ASUI32(lhs) << ASUI32(rhs);
      return;
    case IREE_UK_X32B_SHRSI:
      ASSI32(out) = ASSI32(lhs) >> ASSI32(rhs);
      return;
    case IREE_UK_X32B_SHRUI:
      ASUI32(out) = ASUI32(lhs) >> ASUI32(rhs);
      return;
    case IREE_UKENREL_X32B_XORI:
      ASUI32(out) = ASUI32(lhs) ^ ASUI32(rhs);
      return;
    case IREE_UK_X32B_SUBF:
      ASF32(out) = ASF32(lhs) - ASF32(rhs);
      return;
    case IREE_UK_X32B_SUBI:
      // Store through the unsigned view, like ADDI and MULI: the difference is
      // computed with wrapping unsigned arithmetic, and writing it through the
      // signed view would need an implementation-defined conversion whenever
      // the result exceeds INT32_MAX.
      ASUI32(out) = ASUI32(lhs) - ASUI32(rhs);
      return;
    default:
      *result_code = 1;
  }
}
// Applies one x32u opcode to the single 32-bit element at |in| and writes the
// result to |out|. All opcodes except CTLZ treat the element as a float.
// An unrecognized opcode sets |*result_code| to a non-zero value and leaves
// |out| unwritten; |*result_code| is never written on success.
static void iree_uk_generic_x32u_op(iree_uk_x32u_opcode_t opcode,
                                    int* result_code,
                                    const iree_uk_uint32_t* in,
                                    iree_uk_uint32_t* out) {
  switch (opcode) {
    case IREE_UK_X32U_ABSF: {
      const float value = ASF32(in);
      ASF32(out) = fabsf(value);
      break;
    }
    case IREE_UK_X32U_CEILF: {
      const float value = ASF32(in);
      ASF32(out) = ceilf(value);
      break;
    }
    case IREE_UK_X32U_CTLZ: {
      // The only integer opcode: operates on the unsigned bit pattern.
      ASUI32(out) = iree_uk_count_leading_zeros_u32(ASUI32(in));
      break;
    }
    case IREE_UK_X32U_EXPF: {
      const float value = ASF32(in);
      ASF32(out) = expf(value);
      break;
    }
    case IREE_UK_X32U_FLOORF: {
      const float value = ASF32(in);
      ASF32(out) = floorf(value);
      break;
    }
    case IREE_UK_X32U_LOGF: {
      const float value = ASF32(in);
      ASF32(out) = logf(value);
      break;
    }
    case IREE_UK_X32U_NEGF: {
      const float value = ASF32(in);
      ASF32(out) = -value;
      break;
    }
    case IREE_UK_X32U_RSQRTF: {
      const float value = ASF32(in);
      ASF32(out) = 1.0f / sqrtf(value);
      break;
    }
    default:
      *result_code = 1;
      break;
  }
}
//===----------------------------------------------------------------------===//
// Opcode dispatch entry points.
//===----------------------------------------------------------------------===//
// 32-bit binary kernel over a 2D strided view, shared by all x32b opcodes.
//
// For every (i, j) in [0, size0) x [0, size1) the element at
// base[i * stride0 + j * stride1] of |lhs| and |rhs| is combined according to
// |opcode| and written to the matching element of |out|. Strides are in
// elements. Opcodes with an RVV implementation are processed in vector chunks;
// all others go through the scalar implementation one element at a time.
//
// Returns 0 on success and a non-zero value if |opcode| is not supported.
//
// NOTE(review): |lhs_offset|, |rhs_offset| and |out_offset| are accepted but
// never read here -- presumably callers pass pointers that are already
// offset; confirm against the call sites before relying on them.
IREE_UK_ATTRIBUTE_NOINLINE static int iree_uk_x32b_2d(
    iree_uk_x32b_opcode_t opcode,
    // LHS.
    const iree_uk_uint32_t* lhs, iree_uk_ssize_t lhs_offset,
    iree_uk_ssize_t lhs_stride0, iree_uk_ssize_t lhs_stride1,
    // RHS.
    const iree_uk_uint32_t* rhs, iree_uk_ssize_t rhs_offset,
    iree_uk_ssize_t rhs_stride0, iree_uk_ssize_t rhs_stride1,
    // OUT.
    iree_uk_uint32_t* IREE_UK_RESTRICT out, iree_uk_ssize_t out_offset,
    iree_uk_ssize_t out_stride0, iree_uk_ssize_t out_stride1,
    // Sizes.
    iree_uk_ssize_t size0, iree_uk_ssize_t size1) {
  int result_code = 0;

  // Scalar fallback for opcodes without an RVV implementation.
  if (get_iree_uk_x32b_op_type(opcode) == IREE_UK_X32B_NA) {
    for (iree_uk_ssize_t i = 0; i < size0; ++i) {
      for (iree_uk_ssize_t j = 0; j < size1; ++j) {
        iree_uk_generic_x32b_op(opcode, &result_code,
                                &lhs[i * lhs_stride0 + j * lhs_stride1],
                                &rhs[i * rhs_stride0 + j * rhs_stride1],
                                &out[i * out_stride0 + j * out_stride1]);
      }
    }
    return result_code;
  }

  // The strided vector loads/stores take byte strides; derive the element
  // size from the operand type instead of naming a type from another header.
  const iree_uk_ssize_t elem_bytes = (iree_uk_ssize_t)sizeof(*lhs);

  // Make the most use of vectorization by switching dimension: the longer
  // dimension becomes the vectorized (inner) one.
  if (size0 < size1) {
    const iree_uk_ssize_t lhs_bstride = lhs_stride1 * elem_bytes;
    const iree_uk_ssize_t rhs_bstride = rhs_stride1 * elem_bytes;
    const iree_uk_ssize_t out_bstride = out_stride1 * elem_bytes;
    for (iree_uk_ssize_t i = 0; i < size0; ++i) {
      for (iree_uk_ssize_t j = 0; j < size1;) {
        // vl is how many of the remaining elements this iteration handles.
        const size_t vl = vsetvl_e32m8((size_t)(size1 - j));
        iree_uk_rvv_x32b_op(opcode, &result_code,
                            &lhs[i * lhs_stride0 + j * lhs_stride1],
                            lhs_bstride,
                            &rhs[i * rhs_stride0 + j * rhs_stride1],
                            rhs_bstride,
                            &out[i * out_stride0 + j * out_stride1],
                            out_bstride, vl);
        j += (iree_uk_ssize_t)vl;
      }
    }
  } else {
    const iree_uk_ssize_t lhs_bstride = lhs_stride0 * elem_bytes;
    const iree_uk_ssize_t rhs_bstride = rhs_stride0 * elem_bytes;
    const iree_uk_ssize_t out_bstride = out_stride0 * elem_bytes;
    for (iree_uk_ssize_t j = 0; j < size1; ++j) {
      for (iree_uk_ssize_t i = 0; i < size0;) {
        // vl is how many of the remaining elements this iteration handles.
        const size_t vl = vsetvl_e32m8((size_t)(size0 - i));
        iree_uk_rvv_x32b_op(opcode, &result_code,
                            &lhs[i * lhs_stride0 + j * lhs_stride1],
                            lhs_bstride,
                            &rhs[i * rhs_stride0 + j * rhs_stride1],
                            rhs_bstride,
                            &out[i * out_stride0 + j * out_stride1],
                            out_bstride, vl);
        i += (iree_uk_ssize_t)vl;
      }
    }
  }
  return result_code;
}
// Generic 32-bit unary kernel over a 2D strided view.
//
// Visits every (row, col) in [0, size0) x [0, size1), applies |opcode| to the
// element at in[row * in_stride0 + col * in_stride1] and writes the result to
// out[row * out_stride0 + col * out_stride1]. Strides are in elements.
// |in_offset| and |out_offset| are accepted but not read by this function.
// Returns 0 on success, or the non-zero code set by the per-element helper
// when |opcode| is not recognized.
IREE_UK_ATTRIBUTE_NOINLINE static int iree_uk_generic_x32u_2d(
    iree_uk_x32u_opcode_t opcode,
    // IN.
    const iree_uk_uint32_t* in, iree_uk_ssize_t in_offset,
    iree_uk_ssize_t in_stride0, iree_uk_ssize_t in_stride1,
    // OUT.
    iree_uk_uint32_t* IREE_UK_RESTRICT out, iree_uk_ssize_t out_offset,
    iree_uk_ssize_t out_stride0, iree_uk_ssize_t out_stride1,
    // Sizes.
    iree_uk_ssize_t size0, iree_uk_ssize_t size1) {
  int status = 0;
  // TODO: Manually unroll to x4 to trigger vectorization.
  for (iree_uk_ssize_t row = 0; row < size0; ++row) {
    // Element index of the first column in this row, per buffer.
    const iree_uk_ssize_t in_row_base = row * in_stride0;
    const iree_uk_ssize_t out_row_base = row * out_stride0;
    for (iree_uk_ssize_t col = 0; col < size1; ++col) {
      const iree_uk_uint32_t* src = &in[in_row_base + col * in_stride1];
      iree_uk_uint32_t* dst = &out[out_row_base + col * out_stride1];
      iree_uk_generic_x32u_op(opcode, &status, src, dst);
    }
  }
  return status;
}