/*
 * Copyright 2023 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "iree/builtins/ukernel/common.h"

#include <math.h>
#include <riscv_vector.h>

//===----------------------------------------------------------------------===//
// Helpers for defining generic implementations of elementwise functions.
// Since it affords the best code size tradeoff options, the entrypoint
// is dispatched based on an opcode.
//===----------------------------------------------------------------------===//

// Opcodes for generic functions operating on 32-bit operands and result.
// Since the outer dispatcher only differentiates based on width, all other
// type specificity is carried by the opcode.
// Binary opcodes are named "X32B" and unary opcodes "X32U".
// The initial list was sorted, and it is encouraged to sort extensions, but
// each opcode must be numerically stable, so the list is not expected to
// be sorted over time.
typedef enum {
  // Each value is pinned explicitly; the per-opcode notes below describe what
  // the scalar implementation (iree_uk_generic_x32b_op) computes for it.
  IREE_UK_X32B_ADDF = 0,   // float: lhs + rhs
  IREE_UK_X32B_ADDI = 1,   // unsigned int: lhs + rhs
  IREE_UK_X32B_ANDI = 2,   // unsigned int: lhs & rhs
  IREE_UK_X32B_DIVF = 3,   // float: lhs / rhs
  IREE_UK_X32B_DIVSI = 4,  // signed int: lhs / rhs
  IREE_UK_X32B_DIVUI = 5,  // unsigned int: lhs / rhs
  IREE_UK_X32B_MULF = 6,   // float: lhs * rhs
  IREE_UK_X32B_MULI = 7,   // unsigned int: lhs * rhs
  IREE_UK_X32B_ORI = 8,    // unsigned int: lhs | rhs
  IREE_UK_X32B_SHLI = 9,   // unsigned int: lhs << rhs
  IREE_UK_X32B_SHRSI = 10,  // signed int: lhs >> rhs
  IREE_UK_X32B_SHRUI = 11,  // unsigned int: lhs >> rhs
  IREE_UK_X32B_SUBF = 12,   // float: lhs - rhs
  IREE_UK_X32B_SUBI = 13,   // int: lhs - rhs
  // NOTE(review): the "IREE_UKENREL_" prefix looks like a misspelling of
  // "IREE_UK_". Renaming it requires updating every user of the constant, so
  // it is left as-is here.
  IREE_UKENREL_X32B_XORI = 14,  // unsigned int: lhs ^ rhs
} iree_uk_x32b_opcode_t;

// Classification of an x32b opcode as returned by get_iree_uk_x32b_op_type.
// The RVV kernel loads operands as unsigned vectors for UI opcodes and as
// signed vectors for SI opcodes; NA opcodes are handled by the scalar
// fallback instead.
typedef enum {
  IREE_UK_X32B_UI = 0,  // unsigned integer
  IREE_UK_X32B_SI = 1,  // signed integer
  IREE_UK_X32B_NA = 2,  // not available in RVV
} iree_uk_x32b_opcode_type_t;

// Opcodes for unary functions operating on a 32-bit operand and result.
// Values are pinned explicitly (matching the values the compiler previously
// assigned implicitly) so that inserting or reordering entries cannot
// silently renumber an existing opcode, mirroring iree_uk_x32b_opcode_t.
typedef enum {
  IREE_UK_X32U_ABSF = 0,    // float: fabsf(in)
  IREE_UK_X32U_CEILF = 1,   // float: ceilf(in)
  IREE_UK_X32U_CTLZ = 2,    // unsigned int: count of leading zero bits
  IREE_UK_X32U_EXPF = 3,    // float: expf(in)
  IREE_UK_X32U_FLOORF = 4,  // float: floorf(in)
  IREE_UK_X32U_LOGF = 5,    // float: logf(in)
  IREE_UK_X32U_NEGF = 6,    // float: -in
  IREE_UK_X32U_RSQRTF = 7,  // float: 1 / sqrtf(in)
} iree_uk_x32u_opcode_t;

// Macros to access various typed, dereferenced pointers.
// Each expands to an lvalue that reinterprets the pointed-to 32 bits as the
// named type, so it can appear on either side of an assignment.
// The argument and the whole expansion are parenthesized so that expression
// arguments (e.g. `ASF32(base + n)`) are evaluated before the cast and the
// result composes safely with surrounding operators.
#define ASF32(ptr) (*((float*)(ptr)))
#define ASUI32(ptr) (*((iree_uk_uint32_t*)(ptr)))
#define ASSI32(ptr) (*((iree_uk_int32_t*)(ptr)))

//===----------------------------------------------------------------------===//
// Math helper functions (extracted from base/internal/math.h and adapted
// to be able to be used standalone).
//===----------------------------------------------------------------------===//
#if !defined(__GNUC__) && !defined(__clang__)
#error No clz for this arch.
#endif  // !GCC && !CLANG
// Returns the number of leading zero bits in |n|, or 32 when |n| is zero.
static inline int iree_uk_count_leading_zeros_u32(const iree_uk_uint32_t n) {
  // __builtin_clz(0) is undefined, so zero is answered without calling it.
  return (n != 0) ? (int)__builtin_clz(n) : 32;
}

//===----------------------------------------------------------------------===//
// Implementation macros.
//===----------------------------------------------------------------------===//

// Defines the exported entry point iree_uk_{category}_{opcode}_2d for a
// binary op. The generated function forwards all of its arguments, preceded
// by the opcode constant |opcode_t|, to iree_uk_{category}_2d and returns
// that function's result.
//   opcode:   name fragment used in the generated function's name.
//   opcode_t: opcode constant passed as the first argument of the callee.
//   dtype:    element type of the lhs/rhs/out pointers.
//   category: name fragment selecting the callee (and part of the name).
// Corresponds to the header macro DECLARE_UKERNEL_BINARY_2D.
#define DISPATCH_UKERNEL_BINARY_2D(opcode, opcode_t, dtype, category)         \
  IREE_UK_EXPORT int iree_uk_##category##_##opcode##_2d(                      \
      const dtype* lhs, iree_uk_ssize_t lhs_offset,                           \
      iree_uk_ssize_t lhs_stride0, iree_uk_ssize_t lhs_stride1,               \
      const dtype* rhs, iree_uk_ssize_t rhs_offset,                           \
      iree_uk_ssize_t rhs_stride0, iree_uk_ssize_t rhs_stride1,               \
      dtype* IREE_UK_RESTRICT out, iree_uk_ssize_t out_offset,                \
      iree_uk_ssize_t out_stride0, iree_uk_ssize_t out_stride1,               \
      iree_uk_ssize_t size0, iree_uk_ssize_t size1) {                         \
    return iree_uk_##category##_2d(opcode_t, lhs, lhs_offset, lhs_stride0,    \
                                   lhs_stride1, rhs, rhs_offset, rhs_stride0, \
                                   rhs_stride1, out, out_offset, out_stride0, \
                                   out_stride1, size0, size1);                \
  }

// Defines the exported entry point iree_uk_{category}_{opcode}_2d for a
// unary op. The generated function forwards all of its arguments, preceded
// by the opcode constant |opcode_t|, to iree_uk_generic_{category}_2d and
// returns that function's result.
//   opcode:   name fragment used in the generated function's name.
//   opcode_t: opcode constant passed as the first argument of the callee.
//   dtype:    element type of the in/out pointers.
//   category: name fragment selecting the callee (and part of the name).
// NOTE(review): presumably corresponds to the header macro
// DECLARE_UKERNEL_UNARY_2D; the previous comment named
// DECLARE_UKERNEL_BINARY_2D, likely a copy-paste slip - confirm against the
// header.
#define DISPATCH_UKERNEL_UNARY_2D(opcode, opcode_t, dtype, category)          \
  IREE_UK_EXPORT int iree_uk_##category##_##opcode##_2d(                      \
      const dtype* in, iree_uk_ssize_t in_offset, iree_uk_ssize_t in_stride0, \
      iree_uk_ssize_t in_stride1, dtype* IREE_UK_RESTRICT out,                \
      iree_uk_ssize_t out_offset, iree_uk_ssize_t out_stride0,                \
      iree_uk_ssize_t out_stride1, iree_uk_ssize_t size0,                     \
      iree_uk_ssize_t size1) {                                                \
    return iree_uk_generic_##category##_2d(                                   \
        opcode_t, in, in_offset, in_stride0, in_stride1, out, out_offset,     \
        out_stride0, out_stride1, size0, size1);                              \
  }

//===----------------------------------------------------------------------===//
// Internal helpers.
//===----------------------------------------------------------------------===//

// Maps an x32b opcode to the operand type used by its RVV implementation.
// Opcodes without an RVV implementation (every opcode not listed below) map
// to IREE_UK_X32B_NA.
static iree_uk_x32b_opcode_type_t get_iree_uk_x32b_op_type(
    iree_uk_x32b_opcode_t opcode) {
  iree_uk_x32b_opcode_type_t kind = IREE_UK_X32B_NA;
  switch (opcode) {
    // Handled on unsigned vectors.
    case IREE_UK_X32B_ADDI:
    case IREE_UK_X32B_ANDI:
    case IREE_UK_X32B_DIVUI:
    case IREE_UK_X32B_MULI:
    case IREE_UK_X32B_ORI:
    case IREE_UK_X32B_SHLI:
    case IREE_UK_X32B_SHRUI:
    case IREE_UKENREL_X32B_XORI:
    case IREE_UK_X32B_SUBI:
      kind = IREE_UK_X32B_UI;
      break;
    // Handled on signed vectors.
    case IREE_UK_X32B_DIVSI:
      kind = IREE_UK_X32B_SI;
      break;
    default:
      break;
  }
  return kind;
}

// Computes |vl| elements of an x32b opcode using RVV.
//
// Operands are read with strided vector loads starting at |lhs| and |rhs|,
// combined according to |opcode|, and written with a strided vector store
// starting at |out|. The *_stride arguments are passed straight to the
// strided load/store intrinsics; the caller in this file supplies them as
// byte distances (element stride * sizeof element).
//
// Only opcodes that get_iree_uk_x32b_op_type classifies as UI or SI are
// supported. On any other opcode |*result_code| is set to a non-zero value;
// it is not touched on success.
static void iree_uk_rvv_x32b_op(iree_uk_x32b_opcode_t opcode, int* result_code,
                                const iree_uk_uint32_t* lhs,
                                iree_uk_ssize_t lhs_stride,
                                const iree_uk_uint32_t* rhs,
                                iree_uk_ssize_t rhs_stride,
                                iree_uk_uint32_t* out,
                                iree_uk_ssize_t out_stride, size_t vl) {
  iree_uk_x32b_opcode_type_t op_type = get_iree_uk_x32b_op_type(opcode);
  if (op_type == IREE_UK_X32B_UI) {
    vuint32m8_t vx = vlse32_v_u32m8(lhs, lhs_stride, vl);  // load
    vuint32m8_t vy = vlse32_v_u32m8(rhs, rhs_stride, vl);  // load
    switch (opcode) {
      case IREE_UK_X32B_ADDI:
        vx = vadd(vx, vy, vl);
        break;
      case IREE_UK_X32B_ANDI:
        vx = vand(vx, vy, vl);
        break;
      case IREE_UK_X32B_DIVUI:
        vx = vdivu(vx, vy, vl);
        break;
      case IREE_UK_X32B_MULI:
        vx = vmul(vx, vy, vl);
        break;
      case IREE_UK_X32B_ORI:
        vx = vor(vx, vy, vl);
        break;
      case IREE_UK_X32B_SHLI:
        vx = vsll(vx, vy, vl);
        break;
      case IREE_UK_X32B_SHRUI:
        vx = vsrl(vx, vy, vl);
        break;
      case IREE_UKENREL_X32B_XORI:
        // Must be an exclusive-or to match the scalar implementation (`^`);
        // this previously used vor and computed a bitwise OR instead.
        vx = vxor(vx, vy, vl);
        break;
      case IREE_UK_X32B_SUBI:
        vx = vsub(vx, vy, vl);
        break;
      default:
        *result_code = 1;
    }
    vsse32(out, out_stride, vx, vl);  // save
  } else if (op_type == IREE_UK_X32B_SI) {
    // Same bits, reinterpreted as signed lanes for the signed instructions.
    vint32m8_t vx =
        vlse32_v_i32m8((const iree_uk_int32_t*)lhs, lhs_stride, vl);  // load
    vint32m8_t vy =
        vlse32_v_i32m8((const iree_uk_int32_t*)rhs, rhs_stride, vl);  // load
    switch (opcode) {
      case IREE_UK_X32B_DIVSI:
        vx = vdiv(vx, vy, vl);
        break;
      default:
        *result_code = 1;
    }
    vsse32((iree_uk_int32_t*)out, out_stride, vx, vl);  // save
  } else {
    *result_code = 1;
  }
}

// Computes a single element of an x32b opcode. On error, should set
// |*result_code| to a non-zero value (but should not touch it otherwise).
//
// |lhs|, |rhs| and |out| each point at one 32-bit element; the opcode decides
// whether those bits are interpreted as float, unsigned or signed integer.
// An unrecognized opcode is the only error reported here.
//
// NOTE(review): the integer divisions and shifts are plain C operators, so a
// zero divisor or a shift count >= 32 is undefined behavior; callers
// presumably guarantee valid operands - confirm.
static void iree_uk_generic_x32b_op(iree_uk_x32b_opcode_t opcode,
                                    int* result_code,
                                    const iree_uk_uint32_t* lhs,
                                    const iree_uk_uint32_t* rhs,
                                    iree_uk_uint32_t* out) {
  switch (opcode) {
    case IREE_UK_X32B_ADDF:
      ASF32(out) = ASF32(lhs) + ASF32(rhs);
      return;
    case IREE_UK_X32B_ADDI:
      ASUI32(out) = ASUI32(lhs) + ASUI32(rhs);
      return;
    case IREE_UK_X32B_ANDI:
      ASUI32(out) = ASUI32(lhs) & ASUI32(rhs);
      return;
    case IREE_UK_X32B_DIVF:
      ASF32(out) = ASF32(lhs) / ASF32(rhs);
      return;
    case IREE_UK_X32B_DIVSI:
      ASSI32(out) = ASSI32(lhs) / ASSI32(rhs);
      return;
    case IREE_UK_X32B_DIVUI:
      ASUI32(out) = ASUI32(lhs) / ASUI32(rhs);
      return;
    case IREE_UK_X32B_MULF:
      ASF32(out) = ASF32(lhs) * ASF32(rhs);
      return;
    case IREE_UK_X32B_MULI:
      ASUI32(out) = ASUI32(lhs) * ASUI32(rhs);
      return;
    case IREE_UK_X32B_ORI:
      ASUI32(out) = ASUI32(lhs) | ASUI32(rhs);
      return;
    case IREE_UK_X32B_SHLI:
      ASUI32(out) = ASUI32(lhs) << ASUI32(rhs);
      return;
    case IREE_UK_X32B_SHRSI:
      ASSI32(out) = ASSI32(lhs) >> ASSI32(rhs);
      return;
    case IREE_UK_X32B_SHRUI:
      ASUI32(out) = ASUI32(lhs) >> ASUI32(rhs);
      return;
    case IREE_UKENREL_X32B_XORI:
      ASUI32(out) = ASUI32(lhs) ^ ASUI32(rhs);
      return;
    case IREE_UK_X32B_SUBF:
      ASF32(out) = ASF32(lhs) - ASF32(rhs);
      return;
    case IREE_UK_X32B_SUBI:
      // Store through the unsigned view, like ADDI/MULI: the difference is
      // computed in unsigned (wrapping) arithmetic, and storing it through
      // the signed view would rely on an implementation-defined conversion
      // whenever the result exceeds INT32_MAX.
      ASUI32(out) = ASUI32(lhs) - ASUI32(rhs);
      return;
    default:
      *result_code = 1;
  }
}

// Evaluates one element of an x32u (unary, 32-bit) opcode: reads the element
// at |in| and writes the result to |out|. Every opcode except CTLZ treats the
// element as a float. An unrecognized opcode sets |*result_code| to a
// non-zero value and writes nothing; |*result_code| is left untouched on
// success.
static void iree_uk_generic_x32u_op(iree_uk_x32u_opcode_t opcode,
                                    int* result_code,
                                    const iree_uk_uint32_t* in,
                                    iree_uk_uint32_t* out) {
  // The only integer opcode: handled up front so the switch below deals
  // exclusively with float results.
  if (opcode == IREE_UK_X32U_CTLZ) {
    ASUI32(out) = iree_uk_count_leading_zeros_u32(ASUI32(in));
    return;
  }

  float value;
  switch (opcode) {
    case IREE_UK_X32U_ABSF:
      value = fabsf(ASF32(in));
      break;
    case IREE_UK_X32U_CEILF:
      value = ceilf(ASF32(in));
      break;
    case IREE_UK_X32U_EXPF:
      value = expf(ASF32(in));
      break;
    case IREE_UK_X32U_FLOORF:
      value = floorf(ASF32(in));
      break;
    case IREE_UK_X32U_LOGF:
      value = logf(ASF32(in));
      break;
    case IREE_UK_X32U_NEGF:
      value = -ASF32(in);
      break;
    case IREE_UK_X32U_RSQRTF:
      value = 1.0f / sqrtf(ASF32(in));
      break;
    default:
      // Unknown opcode: report the error without reading |in| or writing
      // |out|.
      *result_code = 1;
      return;
  }
  ASF32(out) = value;
}

//===----------------------------------------------------------------------===//
// Opcode dispatch entry points.
//===----------------------------------------------------------------------===//

// 32bit binary kernels.
//
// Applies |opcode| to every element pair of two |size0| x |size1| views and
// writes the results to |out|. Element (i, j) of each view lives at index
// i * stride0 + j * stride1 from the respective base pointer, so strides are
// expressed in elements.
//
// Opcodes with an RVV implementation are processed in vector chunks along
// the longer of the two dimensions; all other opcodes fall back to the
// scalar per-element implementation.
//
// Returns 0 on success, non-zero if the opcode was not recognized.
//
// NOTE(review): |lhs_offset|, |rhs_offset| and |out_offset| are accepted but
// never read here; presumably callers pass base pointers that already
// include the offset - confirm.
IREE_UK_ATTRIBUTE_NOINLINE static int iree_uk_x32b_2d(
    iree_uk_x32b_opcode_t opcode,
    // LHS.
    const iree_uk_uint32_t* lhs, iree_uk_ssize_t lhs_offset,
    iree_uk_ssize_t lhs_stride0, iree_uk_ssize_t lhs_stride1,
    // RHS
    const iree_uk_uint32_t* rhs, iree_uk_ssize_t rhs_offset,
    iree_uk_ssize_t rhs_stride0, iree_uk_ssize_t rhs_stride1,
    // OUT.
    iree_uk_uint32_t* IREE_UK_RESTRICT out, iree_uk_ssize_t out_offset,
    iree_uk_ssize_t out_stride0, iree_uk_ssize_t out_stride1,
    // Sizes.
    iree_uk_ssize_t size0, iree_uk_ssize_t size1) {
  int result_code = 0;

  // Scalar fallback for opcodes that have no RVV implementation.
  if (get_iree_uk_x32b_op_type(opcode) == IREE_UK_X32B_NA) {
    for (iree_uk_ssize_t i = 0; i < size0; ++i) {
      for (iree_uk_ssize_t j = 0; j < size1; ++j) {
        iree_uk_generic_x32b_op(opcode, &result_code,
                                &lhs[i * lhs_stride0 + j * lhs_stride1],
                                &rhs[i * rhs_stride0 + j * rhs_stride1],
                                &out[i * out_stride0 + j * out_stride1]);
      }
    }
    return result_code;
  }

  // Vector path. |vl| is assigned by vsetvl at the top of every inner-loop
  // iteration, before the loop increment reads it.
  size_t vl;
  // Make the most use of vectorization by vectorizing along the longer
  // dimension.
  if (size0 < size1) {
    // Strided vector accesses walk dimension 1; the RVV helper takes its
    // strides as byte distances, hence the scaling by the element size.
    const iree_uk_ssize_t lhs_step = lhs_stride1 * sizeof(iree_uk_uint32_t);
    const iree_uk_ssize_t rhs_step = rhs_stride1 * sizeof(iree_uk_uint32_t);
    const iree_uk_ssize_t out_step = out_stride1 * sizeof(iree_uk_uint32_t);
    for (iree_uk_ssize_t i = 0; i < size0; ++i) {
      for (iree_uk_ssize_t j = 0; j < size1; j += vl) {
        vl = vsetvl_e32m8(size1 - j);
        iree_uk_rvv_x32b_op(opcode, &result_code,
                            &lhs[i * lhs_stride0 + j * lhs_stride1], lhs_step,
                            &rhs[i * rhs_stride0 + j * rhs_stride1], rhs_step,
                            &out[i * out_stride0 + j * out_stride1], out_step,
                            vl);
      }
    }
  } else {
    // Strided vector accesses walk dimension 0.
    const iree_uk_ssize_t lhs_step = lhs_stride0 * sizeof(iree_uk_uint32_t);
    const iree_uk_ssize_t rhs_step = rhs_stride0 * sizeof(iree_uk_uint32_t);
    const iree_uk_ssize_t out_step = out_stride0 * sizeof(iree_uk_uint32_t);
    for (iree_uk_ssize_t j = 0; j < size1; ++j) {
      for (iree_uk_ssize_t i = 0; i < size0; i += vl) {
        vl = vsetvl_e32m8(size0 - i);
        iree_uk_rvv_x32b_op(opcode, &result_code,
                            &lhs[i * lhs_stride0 + j * lhs_stride1], lhs_step,
                            &rhs[i * rhs_stride0 + j * rhs_stride1], rhs_step,
                            &out[i * out_stride0 + j * out_stride1], out_step,
                            vl);
      }
    }
  }
  return result_code;
}

// Generic 32bit unary kernels.
//
// Applies |opcode| to each element of a |size0| x |size1| view of |in| and
// stores the result at the matching position of |out|. Element (i, j) is
// addressed at index i * stride0 + j * stride1 from the respective base
// pointer. |in_offset| and |out_offset| are accepted but not read by this
// implementation.
//
// Returns 0 on success, or the non-zero code set by the per-element helper
// for an unrecognized opcode.
IREE_UK_ATTRIBUTE_NOINLINE static int iree_uk_generic_x32u_2d(
    iree_uk_x32u_opcode_t opcode,
    // IN.
    const iree_uk_uint32_t* in, iree_uk_ssize_t in_offset,
    iree_uk_ssize_t in_stride0, iree_uk_ssize_t in_stride1,
    // OUT.
    iree_uk_uint32_t* IREE_UK_RESTRICT out, iree_uk_ssize_t out_offset,
    iree_uk_ssize_t out_stride0, iree_uk_ssize_t out_stride1,
    // Sizes.
    iree_uk_ssize_t size0, iree_uk_ssize_t size1) {
  int status = 0;
  // TODO: Manually unroll to x4 to trigger vectorization.
  for (iree_uk_ssize_t row = 0; row < size0; ++row) {
    // Row contributions to the element index, kept as integers so the final
    // index is formed exactly as i * stride0 + j * stride1.
    const iree_uk_ssize_t in_row = row * in_stride0;
    const iree_uk_ssize_t out_row = row * out_stride0;
    for (iree_uk_ssize_t col = 0; col < size1; ++col) {
      const iree_uk_uint32_t* src = &in[in_row + col * in_stride1];
      iree_uk_uint32_t* dst = &out[out_row + col * out_stride1];
      iree_uk_generic_x32u_op(opcode, &status, src, dst);
    }
  }
  return status;
}
