| #include "sim/kelvin_vector_instructions.h" |
| |
| #include <algorithm> |
| #include <cstdint> |
| #include <cstdlib> |
| #include <functional> |
| #include <limits> |
| #include <type_traits> |
| |
| #include "sim/kelvin_state.h" |
| #include "absl/functional/bind_front.h" |
| #include "absl/log/check.h" |
| #include "absl/numeric/bits.h" |
| #include "absl/types/span.h" |
| #include "riscv/riscv_register.h" |
| #include "mpact/sim/generic/data_buffer.h" |
| #include "mpact/sim/generic/instruction.h" |
| |
| namespace kelvin::sim { |
| |
| using mpact::sim::generic::DataBuffer; |
| using mpact::sim::generic::GetInstructionSource; |
| using mpact::sim::riscv::RV32VectorDestinationOperand; |
| |
// Invokes a two-operand element operation. The current destination value
// `vd` is accepted only so both overloads share one call shape; it is not
// forwarded to `op`.
template <typename Vd, typename Vs1, typename Vs2>
Vd BinaryOpInvoke(std::function<Vd(Vs1, Vs2)> op, Vd vd, Vs1 vs1, Vs2 vs2) {
  (void)vd;
  const Vd result = op(vs1, vs2);
  return result;
}

// Invokes a three-operand element operation that also consumes the current
// destination value `vd` as its first argument.
template <typename Vd, typename Vs1, typename Vs2>
Vd BinaryOpInvoke(std::function<Vd(Vd, Vs1, Vs2)> op, Vd vd, Vs1 vs1, Vs2 vs2) {
  const Vd result = op(vd, vs1, vs2);
  return result;
}
| |
| template <typename Vd, typename Vs1, typename Vs2> |
| Vs1 CommonBinaryOpGetArg1(const Instruction *inst, bool scalar, int num_ops, |
| int op_index, int dst_element_index, |
| int dst_reg_index) { |
| auto state = static_cast<KelvinState *>(inst->state()); |
| const int vector_size_in_bytes = state->vector_length() / 8; |
| auto elts_per_register = vector_size_in_bytes / sizeof(Vs1); |
| auto src_element_index = op_index * elts_per_register + |
| dst_element_index * sizeof(Vd) / sizeof(Vs1); |
| if (sizeof(Vd) == sizeof(Vs1) && sizeof(Vs1) == 2 * sizeof(Vs2)) { |
| // special case for VAcc instructions, which uses double the amount |
| // of registers for Vs1, because it's 2x the size of Vs2. |
| src_element_index += num_ops * elts_per_register * dst_reg_index; |
| } else { |
| src_element_index += dst_reg_index; |
| } |
| return GetInstructionSource<Vs1>(inst, 0, src_element_index); |
| } |
| |
| template <typename Vd, typename Vs1, typename Vs2> |
| Vs2 CommonBinaryOpGetArg2(const Instruction *inst, bool scalar, int num_ops, |
| int op_index, int dst_element_index, |
| int dst_reg_index) { |
| auto state = static_cast<KelvinState *>(inst->state()); |
| const int vector_size_in_bytes = state->vector_length() / 8; |
| auto elts_per_register = vector_size_in_bytes / sizeof(Vs2); |
| auto src_element_index = op_index * elts_per_register + |
| dst_element_index * sizeof(Vd) / sizeof(Vs2) + |
| dst_reg_index; |
| return GetInstructionSource<Vs2>(inst, 1, scalar ? 0 : src_element_index); |
| } |
| |
| template <typename T, typename Vd, typename Vs1, typename Vs2> |
| using SourceArgGetter = |
| std::function<T(const Instruction *inst, bool scalar, int num_ops, |
| int op_index, int dst_element_index, int dst_reg_index)>; |
| |
| template <bool halftype = false, bool widen_dst = false, typename Vd, |
| typename Vs1, typename Vs2, typename... VDArgs> |
| void KelvinBinaryVectorOp(const Instruction *inst, bool scalar, bool strip_mine, |
| std::function<Vd(VDArgs..., Vs1, Vs2)> op, |
| SourceArgGetter<Vs1, Vd, Vs1, Vs2> arg1_getter = |
| CommonBinaryOpGetArg1<Vd, Vs1, Vs2>, |
| SourceArgGetter<Vs2, Vd, Vs1, Vs2> arg2_getter = |
| CommonBinaryOpGetArg2<Vd, Vs1, Vs2>) { |
| auto state = static_cast<KelvinState *>(inst->state()); |
| const int vector_size_in_bytes = state->vector_length() / 8; |
| auto elts_per_dest_register = vector_size_in_bytes / sizeof(Vd); |
| |
| // For kelvin, stripmining issues 4 contiguous vector ops. |
| auto num_ops = strip_mine ? 4 : 1; |
| constexpr bool is_widen_op = |
| (sizeof(Vd) > sizeof(Vs2) && !halftype) || widen_dst; |
| // Widening requires 2 destination regs per op. |
| constexpr size_t dest_regs_per_op = is_widen_op ? 2 : 1; |
| // Special case for VADD3 op which is adding dest value to vs1 + vs2. |
| constexpr bool is_reading_dest = sizeof...(VDArgs) == 1; |
| auto vd = static_cast<RV32VectorDestinationOperand *>(inst->Destination(0)); |
| |
| for (int op_index = 0; op_index < num_ops; ++op_index) { |
| DataBuffer *dest_db[dest_regs_per_op]; |
| absl::Span<Vd> dest_span[dest_regs_per_op]; |
| |
| for (int i = 0; i < dest_regs_per_op; ++i) { |
| dest_db[i] = is_reading_dest |
| ? vd->CopyDataBuffer(op_index + i * num_ops) |
| : vd->AllocateDataBuffer(op_index + i * num_ops); |
| dest_span[i] = dest_db[i]->template Get<Vd>(); |
| } |
| |
| for (int dst_element_index = 0; dst_element_index < elts_per_dest_register; |
| ++dst_element_index) { |
| for (int dst_reg_index = 0; dst_reg_index < dest_regs_per_op; |
| ++dst_reg_index) { |
| auto arg1 = arg1_getter(inst, scalar, num_ops, op_index, |
| dst_element_index, dst_reg_index); |
| auto arg2 = arg2_getter(inst, scalar, num_ops, op_index, |
| dst_element_index, dst_reg_index); |
| dest_span[dst_reg_index][dst_element_index] = BinaryOpInvoke( |
| op, dest_span[dst_reg_index][dst_element_index], arg1, arg2); |
| } |
| } |
| |
| for (int i = 0; i < dest_regs_per_op; ++i) { |
| dest_db[i]->Submit(); |
| } |
| } |
| } |
| |
| template <typename Vd, typename Vs> |
| void KelvinUnaryVectorOp(const Instruction *inst, bool strip_mine, |
| std::function<Vd(Vs)> op, |
| SourceArgGetter<Vs, Vd, Vs, Vs> arg_getter = |
| CommonBinaryOpGetArg1<Vd, Vs, Vs>) { |
| auto state = static_cast<KelvinState *>(inst->state()); |
| const int vector_size_in_bytes = state->vector_length() / 8; |
| auto elts_per_dest_register = vector_size_in_bytes / sizeof(Vd); |
| |
| // For kelvin, stripmining issues 4 contiguous vector ops. |
| auto num_ops = strip_mine ? 4 : 1; |
| auto vd = static_cast<RV32VectorDestinationOperand *>(inst->Destination(0)); |
| |
| for (int op_index = 0; op_index < num_ops; ++op_index) { |
| DataBuffer *dest_db = vd->AllocateDataBuffer(op_index); |
| absl::Span<Vd> dest_span = dest_db->template Get<Vd>(); |
| |
| for (int dst_element_index = 0; dst_element_index < elts_per_dest_register; |
| ++dst_element_index) { |
| auto arg = arg_getter(inst, false /* scalar */, num_ops, op_index, |
| dst_element_index, 0 /* dst_reg_index */); |
| dest_span[dst_element_index] = op(arg); |
| } |
| |
| dest_db->Submit(); |
| } |
| } |
| |
| template <typename T> |
| void KelvinVAdd(bool scalar, bool strip_mine, Instruction *inst) { |
| // Return vs1 + vs2. |
| KelvinBinaryVectorOp(inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { |
| using UT = typename std::make_unsigned<T>::type; |
| // Cast to unsigned type before the operation to avoid |
| // undefined overflow behavior in intx_t. |
| UT uvs1 = static_cast<UT>(vs1); |
| UT uvs2 = static_cast<UT>(vs2); |
| return static_cast<T>(uvs1 + uvs2); |
| })); |
| } |
| template void KelvinVAdd<int8_t>(bool, bool, Instruction *); |
| template void KelvinVAdd<int16_t>(bool, bool, Instruction *); |
| template void KelvinVAdd<int32_t>(bool, bool, Instruction *); |
| |
| template <typename T> |
| void KelvinVSub(bool scalar, bool strip_mine, Instruction *inst) { |
| // Return vs1 - vs2. |
| KelvinBinaryVectorOp(inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { |
| using UT = typename std::make_unsigned<T>::type; |
| // Cast to unsigned type before the operation to avoid |
| // undefined overflow behavior in intx_t. |
| UT uvs1 = static_cast<UT>(vs1); |
| UT uvs2 = static_cast<UT>(vs2); |
| return static_cast<T>(uvs1 - uvs2); |
| })); |
| } |
| template void KelvinVSub<int8_t>(bool, bool, Instruction *); |
| template void KelvinVSub<int16_t>(bool, bool, Instruction *); |
| template void KelvinVSub<int32_t>(bool, bool, Instruction *); |
| |
| template <typename T> |
| void KelvinVRSub(bool scalar, bool strip_mine, Instruction *inst) { |
| // Return vs2 - vs1. |
| KelvinBinaryVectorOp(inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { |
| using UT = typename std::make_unsigned<T>::type; |
| // Cast to unsigned type before the operation to avoid |
| // undefined overflow behavior in intx_t. |
| UT uvs1 = static_cast<UT>(vs1); |
| UT uvs2 = static_cast<UT>(vs2); |
| return static_cast<T>(uvs2 - uvs1); |
| })); |
| } |
| template void KelvinVRSub<int8_t>(bool, bool, Instruction *); |
| template void KelvinVRSub<int16_t>(bool, bool, Instruction *); |
| template void KelvinVRSub<int32_t>(bool, bool, Instruction *); |
| |
| template <typename T> |
| void KelvinVEq(bool scalar, bool strip_mine, Instruction *inst) { |
| // Return 1 if vs1 and vs2 are equal, else returns 0. |
| KelvinBinaryVectorOp( |
| inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { return vs1 == vs2; })); |
| } |
| template void KelvinVEq<int8_t>(bool, bool, Instruction *); |
| template void KelvinVEq<int16_t>(bool, bool, Instruction *); |
| template void KelvinVEq<int32_t>(bool, bool, Instruction *); |
| |
| template <typename T> |
| void KelvinVNe(bool scalar, bool strip_mine, Instruction *inst) { |
| // Return 1 if vs1 and vs2 are not equal, else return 0. |
| KelvinBinaryVectorOp( |
| inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { return vs1 != vs2; })); |
| } |
| template void KelvinVNe<int8_t>(bool, bool, Instruction *); |
| template void KelvinVNe<int16_t>(bool, bool, Instruction *); |
| template void KelvinVNe<int32_t>(bool, bool, Instruction *); |
| |
| template <typename T> |
| void KelvinVLt(bool scalar, bool strip_mine, Instruction *inst) { |
| // Returns 1 if vs1 < vs2, else return 0. |
| KelvinBinaryVectorOp( |
| inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { return vs1 < vs2; })); |
| } |
| template void KelvinVLt<int8_t>(bool, bool, Instruction *); |
| template void KelvinVLt<int16_t>(bool, bool, Instruction *); |
| template void KelvinVLt<int32_t>(bool, bool, Instruction *); |
| template void KelvinVLt<uint8_t>(bool, bool, Instruction *); |
| template void KelvinVLt<uint16_t>(bool, bool, Instruction *); |
| template void KelvinVLt<uint32_t>(bool, bool, Instruction *); |
| |
| template <typename T> |
| void KelvinVLe(bool scalar, bool strip_mine, Instruction *inst) { |
| // Returns 1 if vs1 <= vs2, else return 0. |
| KelvinBinaryVectorOp( |
| inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { return vs1 <= vs2; })); |
| } |
| template void KelvinVLe<int8_t>(bool, bool, Instruction *); |
| template void KelvinVLe<int16_t>(bool, bool, Instruction *); |
| template void KelvinVLe<int32_t>(bool, bool, Instruction *); |
| template void KelvinVLe<uint8_t>(bool, bool, Instruction *); |
| template void KelvinVLe<uint16_t>(bool, bool, Instruction *); |
| template void KelvinVLe<uint32_t>(bool, bool, Instruction *); |
| |
| template <typename T> |
| void KelvinVGt(bool scalar, bool strip_mine, Instruction *inst) { |
| // Returns 1 if vs1 > vs2, else return 0. |
| KelvinBinaryVectorOp( |
| inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { return vs1 > vs2; })); |
| } |
| template void KelvinVGt<int8_t>(bool, bool, Instruction *); |
| template void KelvinVGt<int16_t>(bool, bool, Instruction *); |
| template void KelvinVGt<int32_t>(bool, bool, Instruction *); |
| template void KelvinVGt<uint8_t>(bool, bool, Instruction *); |
| template void KelvinVGt<uint16_t>(bool, bool, Instruction *); |
| template void KelvinVGt<uint32_t>(bool, bool, Instruction *); |
| |
| template <typename T> |
| void KelvinVGe(bool scalar, bool strip_mine, Instruction *inst) { |
| // Returns 1 if vs1 >= vs2, else return 0. |
| KelvinBinaryVectorOp( |
| inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { return vs1 >= vs2; })); |
| } |
| template void KelvinVGe<int8_t>(bool, bool, Instruction *); |
| template void KelvinVGe<int16_t>(bool, bool, Instruction *); |
| template void KelvinVGe<int32_t>(bool, bool, Instruction *); |
| template void KelvinVGe<uint8_t>(bool, bool, Instruction *); |
| template void KelvinVGe<uint16_t>(bool, bool, Instruction *); |
| template void KelvinVGe<uint32_t>(bool, bool, Instruction *); |
| |
| template <typename T> |
| void KelvinVAbsd(bool scalar, bool strip_mine, Instruction *inst) { |
| // Returns the absolute difference between vs1 and vs2. |
| // Note: for signed(INTx_MAX - INTx_MIN) the result will be UINTx_MAX. |
| KelvinBinaryVectorOp<false /* halftype */, false /* widen_dst */, |
| typename std::make_unsigned<T>::type, T, T>( |
| inst, scalar, strip_mine, |
| std::function<typename std::make_unsigned<T>::type(T, T)>( |
| [](T vs1, T vs2) -> typename std::make_unsigned<T>::type { |
| using UT = typename std::make_unsigned<T>::type; |
| // Cast to unsigned type before the operation to avoid undefined |
| // overflow behavior in intx_t. |
| UT uvs1 = static_cast<UT>(vs1); |
| UT uvs2 = static_cast<UT>(vs2); |
| return vs1 > vs2 ? uvs1 - uvs2 : uvs2 - uvs1; |
| })); |
| } |
| template void KelvinVAbsd<int8_t>(bool, bool, Instruction *); |
| template void KelvinVAbsd<int16_t>(bool, bool, Instruction *); |
| template void KelvinVAbsd<int32_t>(bool, bool, Instruction *); |
| template void KelvinVAbsd<uint8_t>(bool, bool, Instruction *); |
| template void KelvinVAbsd<uint16_t>(bool, bool, Instruction *); |
| template void KelvinVAbsd<uint32_t>(bool, bool, Instruction *); |
| |
| template <typename T> |
| void KelvinVMax(bool scalar, bool strip_mine, Instruction *inst) { |
| // Return the max of vs1 and vs2. |
| KelvinBinaryVectorOp(inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { |
| return std::max(vs1, vs2); |
| })); |
| } |
| template void KelvinVMax<int8_t>(bool, bool, Instruction *); |
| template void KelvinVMax<int16_t>(bool, bool, Instruction *); |
| template void KelvinVMax<int32_t>(bool, bool, Instruction *); |
| template void KelvinVMax<uint8_t>(bool, bool, Instruction *); |
| template void KelvinVMax<uint16_t>(bool, bool, Instruction *); |
| template void KelvinVMax<uint32_t>(bool, bool, Instruction *); |
| |
| template <typename T> |
| void KelvinVMin(bool scalar, bool strip_mine, Instruction *inst) { |
| // Return the min of vs1 and vs2. |
| KelvinBinaryVectorOp(inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { |
| return std::min(vs1, vs2); |
| })); |
| } |
| template void KelvinVMin<int8_t>(bool, bool, Instruction *); |
| template void KelvinVMin<int16_t>(bool, bool, Instruction *); |
| template void KelvinVMin<int32_t>(bool, bool, Instruction *); |
| template void KelvinVMin<uint8_t>(bool, bool, Instruction *); |
| template void KelvinVMin<uint16_t>(bool, bool, Instruction *); |
| template void KelvinVMin<uint32_t>(bool, bool, Instruction *); |
| |
| template <typename T> |
| void KelvinVAdd3(bool scalar, bool strip_mine, Instruction *inst) { |
| // Return the summation of vd, vs1, and vs2. |
| KelvinBinaryVectorOp<false /* halftype */, false /* widen_dst */, T, T, T, T>( |
| inst, scalar, strip_mine, |
| std::function<T(T, T, T)>([](T vd, T vs1, T vs2) -> T { |
| using UT = typename std::make_unsigned<T>::type; |
| UT uvs1 = static_cast<UT>(vs1); |
| UT uvs2 = static_cast<UT>(vs2); |
| UT uvd = static_cast<UT>(vd); |
| return static_cast<T>(uvd + uvs1 + uvs2); |
| })); |
| } |
| template void KelvinVAdd3<int8_t>(bool, bool, Instruction *); |
| template void KelvinVAdd3<int16_t>(bool, bool, Instruction *); |
| template void KelvinVAdd3<int32_t>(bool, bool, Instruction *); |
| |
// Saturating signed addition used by Vadds.
// The raw sum is formed with unsigned arithmetic so no signed overflow ever
// happens (signed overflow would trigger an exception when compiled with
// --config=asan). Overflow is then detected from the sign bits: the operands
// share a sign but the wrapped sum does not.
template <typename T>
inline T VAddsHelper(T vs1, T vs2) {
  using Unsigned = std::make_unsigned_t<T>;
  const Unsigned wrapped_bits = static_cast<Unsigned>(
      static_cast<Unsigned>(vs1) + static_cast<Unsigned>(vs2));
  const T wrapped = static_cast<T>(wrapped_bits);

  const bool operands_share_sign = (vs1 ^ vs2) >= 0;
  const bool sum_changed_sign = (wrapped ^ vs1) < 0;
  if (!operands_share_sign || !sum_changed_sign) {
    return wrapped;
  }
  if (vs1 > 0) {
    return std::numeric_limits<T>::max();
  }
  return std::numeric_limits<T>::min();
}
| |
| template <typename T> |
| void KelvinVAdds(bool scalar, bool strip_mine, Instruction *inst) { |
| // Return saturated sum of vs1 and vs2. |
| KelvinBinaryVectorOp(inst, scalar, strip_mine, |
| std::function<T(T, T)>(VAddsHelper<T>)); |
| } |
| template void KelvinVAdds<int8_t>(bool, bool, Instruction *); |
| template void KelvinVAdds<int16_t>(bool, bool, Instruction *); |
| template void KelvinVAdds<int32_t>(bool, bool, Instruction *); |
| |
// Saturating unsigned addition used by Vaddsu.
// A wrapped sum is smaller than either operand, which is how overflow is
// detected.
template <typename T>
inline T VAddsuHelper(T vs1, T vs2) {
  const T wrapped = static_cast<T>(vs1 + vs2);
  return wrapped < vs1 ? std::numeric_limits<T>::max() : wrapped;
}
| |
| template <typename T> |
| void KelvinVAddsu(bool scalar, bool strip_mine, Instruction *inst) { |
| // Return saturated sum of unsigned vs1 and vs2. |
| KelvinBinaryVectorOp(inst, scalar, strip_mine, |
| std::function<T(T, T)>(VAddsuHelper<T>)); |
| } |
| template void KelvinVAddsu<uint8_t>(bool, bool, Instruction *); |
| template void KelvinVAddsu<uint16_t>(bool, bool, Instruction *); |
| template void KelvinVAddsu<uint32_t>(bool, bool, Instruction *); |
| |
// Saturating signed subtraction (vs1 - vs2) used by Vsubs.
// The raw difference is formed with unsigned arithmetic. Overflow happened
// when the operands have different signs and the wrapped difference carries
// the sign of vs2.
template <typename T>
inline T VSubsHelper(T vs1, T vs2) {
  using Unsigned = std::make_unsigned_t<T>;
  const Unsigned wrapped_bits = static_cast<Unsigned>(
      static_cast<Unsigned>(vs1) - static_cast<Unsigned>(vs2));
  const T wrapped = static_cast<T>(wrapped_bits);

  const bool operands_differ_in_sign = (vs1 ^ vs2) < 0;
  const bool result_has_subtrahend_sign = (wrapped ^ vs2) >= 0;
  if (!operands_differ_in_sign || !result_has_subtrahend_sign) {
    return wrapped;
  }
  // Subtracting a negative value overflowed upwards; otherwise downwards.
  if (vs2 < 0) {
    return std::numeric_limits<T>::max();
  }
  return std::numeric_limits<T>::min();
}
| |
| template <typename T> |
| void KelvinVSubs(bool scalar, bool strip_mine, Instruction *inst) { |
| // Return saturated sub of vs1 and vs2. |
| KelvinBinaryVectorOp(inst, scalar, strip_mine, |
| std::function<T(T, T)>(VSubsHelper<T>)); |
| } |
| template void KelvinVSubs<int8_t>(bool, bool, Instruction *); |
| template void KelvinVSubs<int16_t>(bool, bool, Instruction *); |
| template void KelvinVSubs<int32_t>(bool, bool, Instruction *); |
| |
| template <typename T> |
| void KelvinVSubsu(bool scalar, bool strip_mine, Instruction *inst) { |
| // Return saturated sub of unsigned vs1 and vs2. |
| KelvinBinaryVectorOp(inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { |
| return vs1 < vs2 ? 0 : vs1 - vs2; |
| })); |
| } |
| template void KelvinVSubsu<uint8_t>(bool, bool, Instruction *); |
| template void KelvinVSubsu<uint16_t>(bool, bool, Instruction *); |
| template void KelvinVSubsu<uint32_t>(bool, bool, Instruction *); |
| |
| template <typename Td, typename Ts> |
| void KelvinVAddw(bool scalar, bool strip_mine, Instruction *inst) { |
| // Adds operands with widening. |
| KelvinBinaryVectorOp(inst, scalar, strip_mine, |
| std::function<Td(Ts, Ts)>([](Ts vs1, Ts vs2) -> Td { |
| return static_cast<Td>(vs1) + static_cast<Td>(vs2); |
| })); |
| } |
| template void KelvinVAddw<int16_t, int8_t>(bool, bool, Instruction *); |
| template void KelvinVAddw<int32_t, int16_t>(bool, bool, Instruction *); |
| template void KelvinVAddw<uint16_t, uint8_t>(bool, bool, Instruction *); |
| template void KelvinVAddw<uint32_t, uint16_t>(bool, bool, Instruction *); |
| |
| template <typename Td, typename Ts> |
| void KelvinVSubw(bool scalar, bool strip_mine, Instruction *inst) { |
| // Subtracts operands with widening. |
| KelvinBinaryVectorOp(inst, scalar, strip_mine, |
| std::function<Td(Ts, Ts)>([](Ts vs1, Ts vs2) -> Td { |
| return static_cast<Td>(vs1) - static_cast<Td>(vs2); |
| })); |
| } |
| template void KelvinVSubw<int16_t, int8_t>(bool, bool, Instruction *); |
| template void KelvinVSubw<int32_t, int16_t>(bool, bool, Instruction *); |
| template void KelvinVSubw<uint16_t, uint8_t>(bool, bool, Instruction *); |
| template void KelvinVSubw<uint32_t, uint16_t>(bool, bool, Instruction *); |
| |
| template <typename Td, typename Ts2> |
| void KelvinVAcc(bool scalar, bool strip_mine, Instruction *inst) { |
| // Accumulates operands with widening. |
| KelvinBinaryVectorOp( |
| inst, scalar, strip_mine, |
| std::function<Td(Td, Ts2)>([](Td vs1, Ts2 vs2) -> Td { |
| using UTd = typename std::make_unsigned<Td>::type; |
| return static_cast<Td>(static_cast<UTd>(vs1) + static_cast<UTd>(vs2)); |
| })); |
| } |
| template void KelvinVAcc<int16_t, int8_t>(bool, bool, Instruction *); |
| template void KelvinVAcc<int32_t, int16_t>(bool, bool, Instruction *); |
| template void KelvinVAcc<uint16_t, uint8_t>(bool, bool, Instruction *); |
| template void KelvinVAcc<uint32_t, uint16_t>(bool, bool, Instruction *); |
| |
| template <typename Vd, typename Vs1, typename Vs2> |
| Vs1 PackedBinaryOpGetArg1(const Instruction *inst, bool scalar, int num_ops, |
| int op_index, int dst_element_index, |
| int dst_reg_index) { |
| auto state = static_cast<KelvinState *>(inst->state()); |
| const int vector_size_in_bytes = state->vector_length() / 8; |
| auto elts_per_register = vector_size_in_bytes / sizeof(Vs1); |
| auto src_element_index = op_index * elts_per_register + |
| dst_element_index * sizeof(Vd) / sizeof(Vs1); |
| return GetInstructionSource<Vs1>(inst, 0, src_element_index); |
| } |
| |
| template <typename Vd, typename Vs1, typename Vs2> |
| Vs2 PackedBinaryOpGetArg2(const Instruction *inst, bool scalar, int num_ops, |
| int op_index, int dst_element_index, |
| int dst_reg_index) { |
| auto state = static_cast<KelvinState *>(inst->state()); |
| const int vector_size_in_bytes = state->vector_length() / 8; |
| auto elts_per_register = vector_size_in_bytes / sizeof(Vs2); |
| auto src_element_index = op_index * elts_per_register + |
| dst_element_index * sizeof(Vd) / sizeof(Vs2) + 1; |
| return GetInstructionSource<Vs2>(inst, 0, src_element_index); |
| } |
| |
| template <typename Td, typename Ts> |
| void KelvinVPadd(bool strip_mine, Instruction *inst) { |
| // Adds lane pairs. |
| KelvinBinaryVectorOp<true /* halftype */, false /* widen_dst */, Td, Ts, Ts>( |
| inst, false /* scalar */, strip_mine, |
| std::function<Td(Ts, Ts)>([](Ts vs1, Ts vs2) -> Td { |
| return static_cast<Td>(vs1) + static_cast<Td>(vs2); |
| }), |
| SourceArgGetter<Ts, Td, Ts, Ts>(PackedBinaryOpGetArg1<Td, Ts, Ts>), |
| SourceArgGetter<Ts, Td, Ts, Ts>(PackedBinaryOpGetArg2<Td, Ts, Ts>)); |
| } |
| template void KelvinVPadd<int16_t, int8_t>(bool, Instruction *); |
| template void KelvinVPadd<int32_t, int16_t>(bool, Instruction *); |
| template void KelvinVPadd<uint16_t, uint8_t>(bool, Instruction *); |
| template void KelvinVPadd<uint32_t, uint16_t>(bool, Instruction *); |
| |
| template <typename Td, typename Ts> |
| void KelvinVPsub(bool strip_mine, Instruction *inst) { |
| // Subtracts lane pairs. |
| KelvinBinaryVectorOp<true /* halftype */, false /* widen_dst */, Td, Ts, Ts>( |
| inst, false /* scalar */, strip_mine, |
| std::function<Td(Ts, Ts)>([](Ts vs1, Ts vs2) -> Td { |
| return static_cast<Td>(vs1) - static_cast<Td>(vs2); |
| }), |
| SourceArgGetter<Ts, Td, Ts, Ts>(PackedBinaryOpGetArg1<Td, Ts, Ts>), |
| SourceArgGetter<Ts, Td, Ts, Ts>(PackedBinaryOpGetArg2<Td, Ts, Ts>)); |
| } |
| template void KelvinVPsub<int16_t, int8_t>(bool, Instruction *); |
| template void KelvinVPsub<int32_t, int16_t>(bool, Instruction *); |
| template void KelvinVPsub<uint16_t, uint8_t>(bool, Instruction *); |
| template void KelvinVPsub<uint32_t, uint16_t>(bool, Instruction *); |
| |
// Halving addition: (vs1 + vs2 + round) >> 1, evaluated in a 64-bit
// intermediate of matching signedness so the sum cannot overflow before the
// shift.
template <typename T>
T KelvinVHaddHelper(bool round, T vs1, T vs2) {
  using Wide = typename std::conditional<std::is_signed<T>::value, int64_t,
                                         uint64_t>::type;
  const Wide rounding_bit = round ? 1 : 0;
  const Wide sum =
      static_cast<Wide>(vs1) + static_cast<Wide>(vs2) + rounding_bit;
  return static_cast<T>(sum >> 1);
}
| |
| template <typename T> |
| void KelvinVHadd(bool scalar, bool strip_mine, bool round, Instruction *inst) { |
| KelvinBinaryVectorOp( |
| inst, scalar, strip_mine, |
| std::function<T(T, T)>(absl::bind_front(&KelvinVHaddHelper<T>, round))); |
| } |
| template void KelvinVHadd<int8_t>(bool, bool, bool, Instruction *); |
| template void KelvinVHadd<int16_t>(bool, bool, bool, Instruction *); |
| template void KelvinVHadd<int32_t>(bool, bool, bool, Instruction *); |
| template void KelvinVHadd<uint8_t>(bool, bool, bool, Instruction *); |
| template void KelvinVHadd<uint16_t>(bool, bool, bool, Instruction *); |
| template void KelvinVHadd<uint32_t>(bool, bool, bool, Instruction *); |
| |
// Halving subtraction: (vs1 - vs2 + round) >> 1, evaluated in a 64-bit
// intermediate of matching signedness. For unsigned T the difference wraps
// in uint64_t before the shift and the result is truncated back to T.
template <typename T>
T KelvinVHsubHelper(bool round, T vs1, T vs2) {
  using Wide = typename std::conditional<std::is_signed<T>::value, int64_t,
                                         uint64_t>::type;
  const Wide rounding_bit = round ? 1 : 0;
  const Wide difference =
      static_cast<Wide>(vs1) - static_cast<Wide>(vs2) + rounding_bit;
  return static_cast<T>(difference >> 1);
}
| |
| template <typename T> |
| void KelvinVHsub(bool scalar, bool strip_mine, bool round, Instruction *inst) { |
| KelvinBinaryVectorOp( |
| inst, scalar, strip_mine, |
| std::function<T(T, T)>(absl::bind_front(&KelvinVHsubHelper<T>, round))); |
| } |
| template void KelvinVHsub<int8_t>(bool, bool, bool, Instruction *); |
| template void KelvinVHsub<int16_t>(bool, bool, bool, Instruction *); |
| template void KelvinVHsub<int32_t>(bool, bool, bool, Instruction *); |
| template void KelvinVHsub<uint8_t>(bool, bool, bool, Instruction *); |
| template void KelvinVHsub<uint16_t>(bool, bool, bool, Instruction *); |
| template void KelvinVHsub<uint32_t>(bool, bool, bool, Instruction *); |
| |
| // Bitwise and. |
| template <typename T> |
| void KelvinVAnd(bool scalar, bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp( |
| inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { return vs1 & vs2; })); |
| } |
| template void KelvinVAnd<uint8_t>(bool, bool, Instruction *); |
| template void KelvinVAnd<uint16_t>(bool, bool, Instruction *); |
| template void KelvinVAnd<uint32_t>(bool, bool, Instruction *); |
| |
| // Bitwise or. |
| template <typename T> |
| void KelvinVOr(bool scalar, bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp( |
| inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { return vs1 | vs2; })); |
| } |
| template void KelvinVOr<uint8_t>(bool, bool, Instruction *); |
| template void KelvinVOr<uint16_t>(bool, bool, Instruction *); |
| template void KelvinVOr<uint32_t>(bool, bool, Instruction *); |
| |
| // Bitwise xor. |
| template <typename T> |
| void KelvinVXor(bool scalar, bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp( |
| inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { return vs1 ^ vs2; })); |
| } |
| template void KelvinVXor<uint8_t>(bool, bool, Instruction *); |
| template void KelvinVXor<uint16_t>(bool, bool, Instruction *); |
| template void KelvinVXor<uint32_t>(bool, bool, Instruction *); |
| |
| // Generalized reverse using bit ladder. |
| template <typename T> |
| void KelvinVRev(bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp( |
| inst, true /* scalar */, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { |
| T r = vs1; |
| // TODO(leonidl): revisit after spec clarification. |
| // For now it's set to always use 5 lower bits, regardless of type. |
| T count = vs2 & 0b11111; |
| if (count & 1) r = ((r & 0x55555555) << 1) | ((r & 0xAAAAAAAA) >> 1); |
| if (count & 2) r = ((r & 0x33333333) << 2) | ((r & 0xCCCCCCCC) >> 2); |
| if (count & 4) r = ((r & 0x0F0F0F0F) << 4) | ((r & 0xF0F0F0F0) >> 4); |
| if (count & 8) r = ((r & 0x00FF00FF) << 8) | ((r & 0xFF00FF00) >> 8); |
| if (count & 16) r = ((r & 0x0000FFFF) << 16) | ((r & 0xFFFF0000) >> 16); |
| return r; |
| })); |
| } |
| template void KelvinVRev<uint8_t>(bool, Instruction *); |
| template void KelvinVRev<uint16_t>(bool, Instruction *); |
| template void KelvinVRev<uint32_t>(bool, Instruction *); |
| |
| // Cyclic rotation right using a bit ladder. |
| template <typename T> |
| void KelvinVRor(bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp(inst, true /* scalar */, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { |
| T r = vs1; |
| T count = vs2 & static_cast<T>(sizeof(T) * 8 - 1); |
| for (auto shift : {1, 2, 4, 8, 16}) { |
| if (count & shift) |
| r = (r >> shift) | (r << (sizeof(T) * 8 - shift)); |
| } |
| return r; |
| })); |
| } |
| template void KelvinVRor<uint8_t>(bool, Instruction *); |
| template void KelvinVRor<uint16_t>(bool, Instruction *); |
| template void KelvinVRor<uint32_t>(bool, Instruction *); |
| |
| // Returns Arg1 as either vs1 or vs2 based on dst_reg_index. |
| template <typename Vd, typename Vs1, typename Vs2> |
| Vs1 VMvpOpGetArg1(const Instruction *inst, bool scalar, int num_ops, |
| int op_index, int dst_element_index, int dst_reg_index) { |
| return dst_reg_index == 0 |
| ? CommonBinaryOpGetArg1<Vd, Vs1, Vs2>( |
| inst, scalar, num_ops, op_index, dst_element_index, 0) |
| : CommonBinaryOpGetArg2<Vd, Vs1, Vs2>( |
| inst, scalar, num_ops, op_index, dst_element_index, 0); |
| } |
| |
| // Copies a pair of registers. |
| template <typename T> |
| void KelvinVMvp(bool scalar, bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp<false /* halftype */, true /* widen_dst */, T, T, T>( |
| inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { return vs1; }), |
| SourceArgGetter<T, T, T, T>(VMvpOpGetArg1<T, T, T>), |
| // Arg2 isn't used. We provide a custom getter here because the default |
| // getter expects extra source registers for widening ops. |
| SourceArgGetter<T, T, T, T>(VMvpOpGetArg1<T, T, T>)); |
| } |
| template void KelvinVMvp<uint8_t>(bool, bool, Instruction *); |
| template void KelvinVMvp<uint16_t>(bool, bool, Instruction *); |
| template void KelvinVMvp<uint32_t>(bool, bool, Instruction *); |
| |
| // Logical shift left. |
| template <typename T> |
| void KelvinVSll(bool scalar, bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp(inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { |
| size_t shift = vs2 & (sizeof(T) * 8 - 1); |
| return vs1 << shift; |
| })); |
| } |
| template void KelvinVSll<uint8_t>(bool, bool, Instruction *); |
| template void KelvinVSll<uint16_t>(bool, bool, Instruction *); |
| template void KelvinVSll<uint32_t>(bool, bool, Instruction *); |
| |
| // Arithmetic shift right. |
| template <typename T> |
| void KelvinVSra(bool scalar, bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp(inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { |
| size_t shift = vs2 & (sizeof(T) * 8 - 1); |
| return vs1 >> shift; |
| })); |
| } |
| template void KelvinVSra<int8_t>(bool, bool, Instruction *); |
| template void KelvinVSra<int16_t>(bool, bool, Instruction *); |
| template void KelvinVSra<int32_t>(bool, bool, Instruction *); |
| |
| // Logical shift right. |
| template <typename T> |
| void KelvinVSrl(bool scalar, bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp(inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { |
| size_t shift = vs2 & (sizeof(T) * 8 - 1); |
| return vs1 >> shift; |
| })); |
| } |
| template void KelvinVSrl<uint8_t>(bool, bool, Instruction *); |
| template void KelvinVSrl<uint16_t>(bool, bool, Instruction *); |
| template void KelvinVSrl<uint32_t>(bool, bool, Instruction *); |
| |
// Logical and arithmetic left/right shift with saturating shift amount and
// result.
//
// vs2 is interpreted as a signed shift amount: positive values shift vs1
// right (adding a rounding bit first when `round` is set), zero and negative
// values shift left by the magnitude. Left-shift results that do not fit in
// T saturate to the limits of T. A zero vs1 always yields 0.
//
// The left-shift magnitude is clamped to the element width *before* the
// amount is negated. Negating first overflows when vs2 is the most negative
// 32-bit value; the clamp makes the former range CHECKs on the magnitude
// true by construction, so they are no longer needed.
template <typename T>
T KelvinVShiftHelper(bool round, T vs1, T vs2) {
  if (std::is_signed<T>::value == true) {
    constexpr int n = sizeof(T) * 8;
    int shamt = vs2;
    int64_t s = vs1;
    if (!vs1) {
      return 0;
    } else if (vs1 < 0 && shamt >= n) {
      // Every value bit is shifted out: -1 without rounding, 0 with it.
      s = -1 + round;
    } else if (vs1 > 0 && shamt >= n) {
      s = 0;
    } else if (shamt > 0) {
      s = (static_cast<int64_t>(vs1) + (round ? (1ll << (shamt - 1)) : 0)) >>
          shamt;
    } else {  // shamt <= 0
      // Clamp first so that -shamt can never overflow.
      const int left_shift = shamt <= -n ? n : -shamt;
      // Shift the bit pattern as unsigned to avoid shifting a negative value.
      s = static_cast<int64_t>(static_cast<uint64_t>(vs1) << left_shift);
    }
    T neg_max = std::numeric_limits<T>::min();
    T pos_max = std::numeric_limits<T>::max();
    bool neg_sat = vs1 < 0 && (shamt <= -n || s < neg_max);
    bool pos_sat = vs1 > 0 && (shamt <= -n || s > pos_max);
    if (neg_sat) return neg_max;
    if (pos_sat) return pos_max;
    return static_cast<T>(s);
  } else {
    constexpr int n = sizeof(T) * 8;
    // Shift can be positive/negative.
    int shamt = static_cast<typename std::make_signed<T>::type>(vs2);
    uint64_t s = vs1;
    if (!vs1) {
      return 0;
    } else if (shamt > n) {
      s = 0;
    } else if (shamt > 0) {
      s = (static_cast<uint64_t>(vs1) + (round ? (1ull << (shamt - 1)) : 0)) >>
          shamt;
    } else {  // shamt <= 0
      // Clamp first so that -shamt can never overflow and the 64-bit shift
      // below stays within range.
      const int left_shift = shamt <= -n ? n : -shamt;
      s = static_cast<uint64_t>(vs1) << left_shift;
    }
    T pos_max = std::numeric_limits<T>::max();
    bool pos_sat = vs1 && (shamt < -n || s > pos_max);
    if (pos_sat) return pos_max;
    return static_cast<T>(s);
  }
}
| |
| template <typename T> |
| void KelvinVShift(bool round, bool scalar, bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp( |
| inst, scalar, strip_mine, |
| std::function<T(T, T)>(absl::bind_front(&KelvinVShiftHelper<T>, round))); |
| } |
| template void KelvinVShift<int8_t>(bool, bool, bool, Instruction *); |
| template void KelvinVShift<int16_t>(bool, bool, bool, Instruction *); |
| template void KelvinVShift<int32_t>(bool, bool, bool, Instruction *); |
| template void KelvinVShift<uint8_t>(bool, bool, bool, Instruction *); |
| template void KelvinVShift<uint16_t>(bool, bool, bool, Instruction *); |
| template void KelvinVShift<uint32_t>(bool, bool, bool, Instruction *); |
| |
| // Bitwise not. |
| template <typename T> |
| void KelvinVNot(bool strip_mine, Instruction *inst) { |
| KelvinUnaryVectorOp(inst, strip_mine, |
| std::function<T(T)>([](T vs) -> T { return ~vs; })); |
| } |
| template void KelvinVNot<int32_t>(bool, Instruction *); |
| |
| // Count the leading bits. |
| template <typename T> |
| void KelvinVClb(bool strip_mine, Instruction *inst) { |
| KelvinUnaryVectorOp(inst, strip_mine, std::function<T(T)>([](T vs) -> T { |
| return (vs & (1u << (sizeof(T) * 8 - 1))) |
| ? absl::countl_one(vs) |
| : absl::countl_zero(vs); |
| })); |
| } |
| template void KelvinVClb<uint8_t>(bool, Instruction *); |
| template void KelvinVClb<uint16_t>(bool, Instruction *); |
| template void KelvinVClb<uint32_t>(bool, Instruction *); |
| |
| // Count the leading zeros. |
| template <typename T> |
| void KelvinVClz(bool strip_mine, Instruction *inst) { |
| KelvinUnaryVectorOp(inst, strip_mine, std::function<T(T)>([](T vs) -> T { |
| return absl::countl_zero(vs); |
| })); |
| } |
| template void KelvinVClz<uint8_t>(bool, Instruction *); |
| template void KelvinVClz<uint16_t>(bool, Instruction *); |
| template void KelvinVClz<uint32_t>(bool, Instruction *); |
| |
| // Count the set bits. |
| template <typename T> |
| void KelvinVCpop(bool strip_mine, Instruction *inst) { |
| KelvinUnaryVectorOp(inst, strip_mine, std::function<T(T)>([](T vs) -> T { |
| return absl::popcount(vs); |
| })); |
| } |
| template void KelvinVCpop<uint8_t>(bool, Instruction *); |
| template void KelvinVCpop<uint16_t>(bool, Instruction *); |
| template void KelvinVCpop<uint32_t>(bool, Instruction *); |
| |
| // Move a register. |
| template <typename T> |
| void KelvinVMv(bool strip_mine, Instruction *inst) { |
| KelvinUnaryVectorOp(inst, strip_mine, |
| std::function<T(T)>([](T vs) -> T { return vs; })); |
| } |
| template void KelvinVMv<int32_t>(bool, Instruction *); |
| |
| // Alternates Vs1 register used for odd/even destination indices. |
| template <typename Vd, typename Vs1, typename Vs2> |
| Vs1 VSransOpGetArg1(const Instruction *inst, bool scalar, int num_ops, |
| int op_index, int dst_element_index, int dst_reg_index) { |
| static_assert(2 * sizeof(Vd) == sizeof(Vs1) || 4 * sizeof(Vd) == sizeof(Vs1)); |
| auto state = static_cast<KelvinState *>(inst->state()); |
| const int vector_size_in_bytes = state->vector_length() / 8; |
| auto elts_per_register = vector_size_in_bytes / sizeof(Vs1); |
| auto src_element_index = op_index * elts_per_register + |
| dst_element_index * sizeof(Vd) / sizeof(Vs1); |
| |
| if (sizeof(Vs1) / sizeof(Vd) == 2) { |
| src_element_index += |
| dst_element_index & 1 ? num_ops * elts_per_register : 0; |
| } else { // sizeof(Vs1) / sizeof(Vd) == 4 |
| const int interleave[4] = {0, 2, 1, 3}; |
| src_element_index += |
| interleave[dst_element_index & 3] * num_ops * elts_per_register; |
| } |
| |
| return GetInstructionSource<Vs1>(inst, 0, src_element_index); |
| } |
| |
// Arithmetic right shift with rounding and signed/unsigned saturation.
// Narrowing x2 or x4.
//
// The shift amount is `vs2` reduced to the low bits that fit the source
// width. When `round` is set and the shift is non-zero, half an LSB of the
// result is added before shifting. The shifted value is clamped to the range
// of the narrower destination type.
template <typename Td, typename Ts>
Td KelvinVSransHelper(bool round, Ts vs1, Td vs2) {
  static_assert(sizeof(Ts) == 2 * sizeof(Td) || sizeof(Ts) == 4 * sizeof(Td));
  using DstLimits = std::numeric_limits<Td>;
  constexpr int kSrcBits = sizeof(Ts) * 8;
  vs2 &= (kSrcBits - 1);

  int64_t rounding = 0;
  if (vs2 && round) {
    rounding = 1ll << (vs2 - 1);
  }
  const int64_t shifted = (static_cast<int64_t>(vs1) + rounding) >> vs2;

  if (shifted < DstLimits::min()) return DstLimits::min();
  if (shifted > DstLimits::max()) return DstLimits::max();
  if (!vs1) return 0;
  return static_cast<Td>(shifted);
}
| |
| template <typename Td, typename Ts> |
| void KelvinVSrans(bool round, bool scalar, bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp( |
| inst, scalar, strip_mine, |
| std::function<Td(Ts, Td)>( |
| absl::bind_front(&KelvinVSransHelper<Td, Ts>, round)), |
| SourceArgGetter<Ts, Td, Ts, Td>(VSransOpGetArg1<Td, Ts, Td>)); |
| } |
| template void KelvinVSrans<int8_t, int16_t>(bool, bool, bool, Instruction *); |
| template void KelvinVSrans<int16_t, int32_t>(bool, bool, bool, Instruction *); |
| template void KelvinVSrans<uint8_t, uint16_t>(bool, bool, bool, Instruction *); |
| template void KelvinVSrans<uint16_t, uint32_t>(bool, bool, bool, Instruction *); |
| template void KelvinVSrans<int8_t, int32_t>(bool, bool, bool, Instruction *); |
| template void KelvinVSrans<uint8_t, uint32_t>(bool, bool, bool, Instruction *); |
| |
| // Multiplication of vector elements. |
| template <typename T> |
| void KelvinVMul(bool scalar, bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp(inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { |
| if (std::is_signed<T>::value) { |
| return static_cast<T>(static_cast<int64_t>(vs1) * |
| static_cast<int64_t>(vs2)); |
| } else { |
| return static_cast<T>(static_cast<uint64_t>(vs1) * |
| static_cast<uint64_t>(vs2)); |
| } |
| })); |
| } |
| template void KelvinVMul<int8_t>(bool, bool, Instruction *); |
| template void KelvinVMul<int16_t>(bool, bool, Instruction *); |
| template void KelvinVMul<int32_t>(bool, bool, Instruction *); |
| |
| // Multiplication of vector elements with saturation. |
| template <typename T> |
| void KelvinVMuls(bool scalar, bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp( |
| inst, scalar, strip_mine, std::function<T(T, T)>([](T vs1, T vs2) -> T { |
| if (std::is_signed<T>::value) { |
| int64_t result = |
| static_cast<int64_t>(vs1) * static_cast<int64_t>(vs2); |
| result = std::max( |
| static_cast<int64_t>(std::numeric_limits<T>::min()), |
| std::min(static_cast<int64_t>(std::numeric_limits<T>::max()), |
| result)); |
| return result; |
| } else { |
| uint64_t result = |
| static_cast<uint64_t>(vs1) * static_cast<uint64_t>(vs2); |
| result = std::min( |
| static_cast<uint64_t>(std::numeric_limits<T>::max()), result); |
| return result; |
| } |
| })); |
| } |
| template void KelvinVMuls<int8_t>(bool, bool, Instruction *); |
| template void KelvinVMuls<int16_t>(bool, bool, Instruction *); |
| template void KelvinVMuls<int32_t>(bool, bool, Instruction *); |
| template void KelvinVMuls<uint8_t>(bool, bool, Instruction *); |
| template void KelvinVMuls<uint16_t>(bool, bool, Instruction *); |
| template void KelvinVMuls<uint32_t>(bool, bool, Instruction *); |
| |
| // Multiplication of vector elements with widening. |
| template <typename Td, typename Ts> |
| void KelvinVMulw(bool scalar, bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp(inst, scalar, strip_mine, |
| std::function<Td(Ts, Ts)>([](Ts vs1, Ts vs2) -> Td { |
| return static_cast<Td>(vs1) * static_cast<Td>(vs2); |
| })); |
| } |
| template void KelvinVMulw<int16_t, int8_t>(bool, bool, Instruction *); |
| template void KelvinVMulw<int32_t, int16_t>(bool, bool, Instruction *); |
| template void KelvinVMulw<uint16_t, uint8_t>(bool, bool, Instruction *); |
| template void KelvinVMulw<uint32_t, uint16_t>(bool, bool, Instruction *); |
| |
// Multiplication of vector elements with widening and optional rounding.
// Returns high half.
//
// The full product is computed in 64 bits; when `round` is set, half of the
// discarded low part (1 << (bits - 1)) is added before the high half is taken.
template <typename T>
T KelvinVMulhHelper(bool round, T vs1, T vs2) {
  constexpr int kBits = sizeof(T) * 8;
  if (!std::is_signed<T>::value) {
    uint64_t product = static_cast<uint64_t>(vs1) * static_cast<uint64_t>(vs2);
    if (round) {
      product += 1ull << (kBits - 1);
    }
    return product >> kBits;
  }

  int64_t product = static_cast<int64_t>(vs1) * static_cast<int64_t>(vs2);
  if (round) {
    product += 1ll << (kBits - 1);
  }
  // Shift the bit pattern as unsigned; the truncation to T keeps the sign.
  return static_cast<uint64_t>(product) >> kBits;
}
| |
| template <typename T> |
| void KelvinVMulh(bool scalar, bool strip_mine, bool round, Instruction *inst) { |
| KelvinBinaryVectorOp( |
| inst, scalar, strip_mine, |
| std::function<T(T, T)>(absl::bind_front(&KelvinVMulhHelper<T>, round))); |
| } |
| template void KelvinVMulh<int8_t>(bool, bool, bool, Instruction *); |
| template void KelvinVMulh<int16_t>(bool, bool, bool, Instruction *); |
| template void KelvinVMulh<int32_t>(bool, bool, bool, Instruction *); |
| template void KelvinVMulh<uint8_t>(bool, bool, bool, Instruction *); |
| template void KelvinVMulh<uint16_t>(bool, bool, bool, Instruction *); |
| template void KelvinVMulh<uint32_t>(bool, bool, bool, Instruction *); |
| |
// Saturating signed doubling multiply returning high half with optional
// rounding.
//
// Computes (vs1 * vs2) >> (bits - 1). When `round` is set, 1 << (bits - 2) is
// added before the shift; if `round_neg` is also set and the product is
// negative, the same magnitude is subtracted instead. The only input pair
// whose doubled product does not fit, min * min, saturates to the maximum of
// T.
template <typename T>
T KelvinVDmulhHelper(bool round, bool round_neg, T vs1, T vs2) {
  constexpr int n = sizeof(T) * 8;
  if (vs1 == std::numeric_limits<T>::min() &&
      vs2 == std::numeric_limits<T>::min()) {
    return std::numeric_limits<T>::max();
  }
  // Multiply the two operands (not vs1 by itself).
  int64_t result = static_cast<int64_t>(vs1) * static_cast<int64_t>(vs2);
  if (round) {
    int64_t rnd = 0x40000000ll >> (32 - n);
    if (result < 0 && round_neg) {
      rnd = (-0x40000000ll) >> (32 - n);
    }
    result += rnd;
  }
  result >>= (n - 1);
  return static_cast<T>(result);
}
| template <typename T> |
| void KelvinVDmulh(bool scalar, bool strip_mine, bool round, bool round_neg, |
| Instruction *inst) { |
| KelvinBinaryVectorOp(inst, scalar, strip_mine, |
| std::function<T(T, T)>(absl::bind_front( |
| &KelvinVDmulhHelper<T>, round, round_neg))); |
| } |
| template void KelvinVDmulh<int8_t>(bool, bool, bool, bool, Instruction *); |
| template void KelvinVDmulh<int16_t>(bool, bool, bool, bool, Instruction *); |
| template void KelvinVDmulh<int32_t>(bool, bool, bool, bool, Instruction *); |
| |
| // Multiply accumulate. |
| template <typename T> |
| void KelvinVMacc(bool scalar, bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp<false /* halftype */, false /* widen_dst */, T, T, T, T>( |
| inst, scalar, strip_mine, |
| std::function<T(T, T, T)>([](T vd, T vs1, T vs2) -> T { |
| return static_cast<int64_t>(vd) + |
| static_cast<int64_t>(vs1) * static_cast<int64_t>(vs2); |
| })); |
| } |
| template void KelvinVMacc<int8_t>(bool, bool, Instruction *); |
| template void KelvinVMacc<int16_t>(bool, bool, Instruction *); |
| template void KelvinVMacc<int32_t>(bool, bool, Instruction *); |
| |
| // Multiply add. |
| template <typename T> |
| void KelvinVMadd(bool scalar, bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp<false /* halftype */, false /* widen_dst */, T, T, T, T>( |
| inst, scalar, strip_mine, |
| std::function<T(T, T, T)>([](T vd, T vs1, T vs2) -> T { |
| return static_cast<int64_t>(vs1) + |
| static_cast<int64_t>(vd) * static_cast<int64_t>(vs2); |
| })); |
| } |
| template void KelvinVMadd<int8_t>(bool, bool, Instruction *); |
| template void KelvinVMadd<int16_t>(bool, bool, Instruction *); |
| template void KelvinVMadd<int32_t>(bool, bool, Instruction *); |
| |
| // Computes slide index for next register and takes result from either vs1 or |
| // vs2. |
| template <typename T> |
| T VSlidenOpGetArg1(bool horizontal, int index, const Instruction *inst, |
| bool scalar, int num_ops, int op_index, |
| int dst_element_index, int dst_reg_index) { |
| auto state = static_cast<KelvinState *>(inst->state()); |
| const int vector_size_in_bytes = state->vector_length() / 8; |
| auto elts_per_register = vector_size_in_bytes / sizeof(T); |
| |
| using Interleave = struct { |
| int register_num; |
| int source_arg; |
| }; |
| const Interleave interleave_start[2][4] = {{{0, 0}, {1, 0}, {2, 0}, {3, 0}}, |
| {{0, 0}, {1, 0}, {2, 0}, {3, 0}}}; |
| const Interleave interleave_end[2][4] = {{{0, 1}, {1, 1}, {2, 1}, {3, 1}}, |
| {{1, 0}, {2, 0}, {3, 0}, {0, 1}}}; |
| // Get the elements from the right up to `index`. |
| // For the horizontal mode, it treats the stripmine `vm` register based on |
| // `vs1` as a contiguous block, and only the first `index` elements from `vs2` |
| // will be used. |
| // |
| // For the vertical mode, each stripmine vector register `op_index` is mapped |
| // separatedly. it mimics the imaging tiling process shift of |
| // |--------|--------| |
| // | 4xVLEN | 4xVLEN | |
| // | (vs1) | (vs2) | |
| // |--------|--------| |
| // The vertical mode can also support the non-stripmine version to handle |
| // the last columns of the image. |
| if (dst_element_index + index < elts_per_register) { |
| auto src_element_index = |
| interleave_start[horizontal][op_index].register_num * |
| elts_per_register + |
| dst_element_index + index; |
| return GetInstructionSource<T>( |
| inst, interleave_start[horizontal][op_index].source_arg, |
| src_element_index); |
| } |
| |
| auto src_element_index = |
| interleave_end[horizontal][op_index].register_num * elts_per_register + |
| dst_element_index + index - elts_per_register; |
| return GetInstructionSource<T>( |
| inst, interleave_end[horizontal][op_index].source_arg, src_element_index); |
| } |
| |
| // Slide next register vertically by index. |
| template <typename T> |
| void KelvinVSlidevn(int index, bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp( |
| inst, false /* scalar */, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { return vs1; }), |
| SourceArgGetter<T, T, T, T>(absl::bind_front( |
| VSlidenOpGetArg1<T>, false /* horizontal */, index))); |
| } |
| template void KelvinVSlidevn<int8_t>(int, bool, Instruction *); |
| template void KelvinVSlidevn<int16_t>(int, bool, Instruction *); |
| template void KelvinVSlidevn<int32_t>(int, bool, Instruction *); |
| |
| // Slide next register horizontally by index. |
| template <typename T> |
| void KelvinVSlidehn(int index, Instruction *inst) { |
| KelvinBinaryVectorOp( |
| inst, false /* scalar */, true /* strip_mine */, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { return vs1; }), |
| SourceArgGetter<T, T, T, T>( |
| absl::bind_front(VSlidenOpGetArg1<T>, true /* horizontal */, index))); |
| } |
| template void KelvinVSlidehn<int8_t>(int, Instruction *); |
| template void KelvinVSlidehn<int16_t>(int, Instruction *); |
| template void KelvinVSlidehn<int32_t>(int, Instruction *); |
| |
| // Computes slide index for previous register and takes result from either vs1 |
| // or vs2. |
| template <typename T> |
| T VSlidepOpGetArg1(bool horizontal, int index, const Instruction *inst, |
| bool scalar, int num_ops, int op_index, |
| int dst_element_index, int dst_reg_index) { |
| auto state = static_cast<KelvinState *>(inst->state()); |
| const int vector_size_in_bytes = state->vector_length() / 8; |
| auto elts_per_register = vector_size_in_bytes / sizeof(T); |
| |
| using Interleave = struct { |
| int register_num; |
| int source_arg; |
| }; |
| const Interleave interleave_start[2][4] = {{{0, 0}, {1, 0}, {2, 0}, {3, 0}}, |
| {{3, 0}, {0, 1}, {1, 1}, {2, 1}}}; |
| const Interleave interleave_end[2][4] = {{{0, 1}, {1, 1}, {2, 1}, {3, 1}}, |
| {{0, 1}, {1, 1}, {2, 1}, {3, 1}}}; |
| // Get the elements from the left up to `index`. |
| // For the horizontal mode, it treats the stripmine `vm` register based on |
| // `vs2` as a contiguous block, and only the LAST `index` elements from |
| // stripmine vm register based on `vs1` will be used AT THE BEGINNING. |
| // |
| // For the vertical mode, each stripmine vector register `op_index` is mapped |
| // separatedly. it mimics the imaging tiling process shift of |
| // |--------|--------| |
| // | 4xVLEN | 4xVLEN | |
| // | (vs1) | (vs2) | |
| // |--------|--------| |
| // The vertical mode can also support the non-stripmine version to handle |
| // the last columns of the image. |
| if (dst_element_index < index) { |
| auto src_element_index = |
| interleave_start[horizontal][op_index].register_num * |
| elts_per_register + |
| dst_element_index - index + elts_per_register; |
| return GetInstructionSource<T>( |
| inst, interleave_start[horizontal][op_index].source_arg, |
| src_element_index); |
| } |
| |
| auto src_element_index = |
| interleave_end[horizontal][op_index].register_num * elts_per_register + |
| dst_element_index - index; |
| return GetInstructionSource<T>( |
| inst, interleave_end[horizontal][op_index].source_arg, src_element_index); |
| } |
| |
| // Slide previous register vertically by index. |
| template <typename T> |
| void KelvinVSlidevp(int index, bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp( |
| inst, false /* scalar */, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { return vs1; }), |
| SourceArgGetter<T, T, T, T>(absl::bind_front( |
| VSlidepOpGetArg1<T>, false /* horizontal */, index))); |
| } |
| template void KelvinVSlidevp<int8_t>(int, bool, Instruction *); |
| template void KelvinVSlidevp<int16_t>(int, bool, Instruction *); |
| template void KelvinVSlidevp<int32_t>(int, bool, Instruction *); |
| |
| // Slide previous register horizontally by index. |
| template <typename T> |
| void KelvinVSlidehp(int index, Instruction *inst) { |
| KelvinBinaryVectorOp( |
| inst, false /* scalar */, true /* strip_mine */, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { return vs1; }), |
| SourceArgGetter<T, T, T, T>( |
| absl::bind_front(VSlidepOpGetArg1<T>, true /* horizontal */, index))); |
| } |
| template void KelvinVSlidehp<int8_t>(int, Instruction *); |
| template void KelvinVSlidehp<int16_t>(int, Instruction *); |
| template void KelvinVSlidehp<int32_t>(int, Instruction *); |
| |
| template <typename T> |
| void KelvinVSel(bool scalar, bool strip_mine, Instruction *inst); |
| |
| template <typename T> |
| void KelvinVSel(bool scalar, bool strip_mine, Instruction *inst) { |
| // Select lanes from two operands with vector selection boolean. |
| KelvinBinaryVectorOp<false /* halftype */, false /* widen_dst */, T, T, T, T>( |
| inst, scalar, strip_mine, |
| std::function<T(T, T, T)>( |
| [](T vd, T vs1, T vs2) -> T { return vs1 & 1 ? vd : vs2; })); |
| } |
| template void KelvinVSel<int8_t>(bool, bool, Instruction *); |
| template void KelvinVSel<int16_t>(bool, bool, Instruction *); |
| template void KelvinVSel<int32_t>(bool, bool, Instruction *); |
| |
| // Returns even elements of concatenated registers. |
| template <typename T> |
| T VEvnOpGetArg1(const Instruction *inst, bool scalar, int num_ops, int op_index, |
| int dst_element_index, int dst_reg_index) { |
| auto state = static_cast<KelvinState *>(inst->state()); |
| const int vector_size_in_bytes = state->vector_length() / 8; |
| const int elts_per_register = vector_size_in_bytes / sizeof(T); |
| |
| auto src_element_index = |
| op_index * elts_per_register * 2 + dst_element_index * 2; |
| const int elts_per_src = elts_per_register * num_ops; |
| |
| if (src_element_index < elts_per_src) { |
| return GetInstructionSource<T>(inst, 0, src_element_index); |
| } |
| |
| return GetInstructionSource<T>(inst, 1, |
| scalar ? 0 : src_element_index - elts_per_src); |
| } |
| |
| template <typename T> |
| void KelvinVEvn(bool scalar, bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp<false /* halftype */, false /* widen_dst */, T, T, T>( |
| inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { return vs1; }), |
| SourceArgGetter<T, T, T, T>(VEvnOpGetArg1<T>), |
| SourceArgGetter<T, T, T, T>(VEvnOpGetArg1<T>)); |
| } |
| template void KelvinVEvn<int8_t>(bool, bool, Instruction *); |
| template void KelvinVEvn<int16_t>(bool, bool, Instruction *); |
| template void KelvinVEvn<int32_t>(bool, bool, Instruction *); |
| |
| // Returns odd elements of concatenated registers. |
| template <typename T> |
| T VOddOpGetArg1(const Instruction *inst, bool scalar, int num_ops, int op_index, |
| int dst_element_index, int dst_reg_index) { |
| auto state = static_cast<KelvinState *>(inst->state()); |
| const int vector_size_in_bytes = state->vector_length() / 8; |
| const int elts_per_register = vector_size_in_bytes / sizeof(T); |
| |
| auto src_element_index = |
| op_index * elts_per_register * 2 + dst_element_index * 2 + 1; |
| const int elts_per_src = elts_per_register * num_ops; |
| |
| if (src_element_index < elts_per_src) { |
| return GetInstructionSource<T>(inst, 0, src_element_index); |
| } |
| |
| return GetInstructionSource<T>(inst, 1, |
| scalar ? 0 : src_element_index - elts_per_src); |
| } |
| |
| template <typename T> |
| void KelvinVOdd(bool scalar, bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp<false /* halftype */, false /* widen_dst */, T, T, T>( |
| inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { return vs1; }), |
| SourceArgGetter<T, T, T, T>(VOddOpGetArg1<T>), |
| SourceArgGetter<T, T, T, T>(VOddOpGetArg1<T>)); |
| } |
| template void KelvinVOdd<int8_t>(bool, bool, Instruction *); |
| template void KelvinVOdd<int16_t>(bool, bool, Instruction *); |
| template void KelvinVOdd<int32_t>(bool, bool, Instruction *); |
| |
| // Returns evn/odd elements of concatenated registers based on dst_reg_index. |
| template <typename T> |
| T VEvnoddOpGetArg1(const Instruction *inst, bool scalar, int num_ops, |
| int op_index, int dst_element_index, int dst_reg_index) { |
| return dst_reg_index == 0 |
| ? VEvnOpGetArg1<T>(inst, scalar, num_ops, op_index, |
| dst_element_index, dst_reg_index) |
| : VOddOpGetArg1<T>(inst, scalar, num_ops, op_index, |
| dst_element_index, dst_reg_index); |
| } |
| |
| template <typename T> |
| void KelvinVEvnodd(bool scalar, bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp<false /* halftype */, true /* widen_dst */, T, T, T>( |
| inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { return vs1; }), |
| SourceArgGetter<T, T, T, T>(VEvnoddOpGetArg1<T>), |
| SourceArgGetter<T, T, T, T>(VEvnoddOpGetArg1<T>)); |
| } |
| template void KelvinVEvnodd<int8_t>(bool, bool, Instruction *); |
| template void KelvinVEvnodd<int16_t>(bool, bool, Instruction *); |
| template void KelvinVEvnodd<int32_t>(bool, bool, Instruction *); |
| |
| // Interleave even/odd lanes of two operands. |
| // Returns odd elements of concatenated registers. |
| template <typename T> |
| T VZipOpGetArg1(const Instruction *inst, bool scalar, int num_ops, int op_index, |
| int dst_element_index, int dst_reg_index) { |
| auto state = static_cast<KelvinState *>(inst->state()); |
| const int vector_size_in_bytes = state->vector_length() / 8; |
| const int elts_per_register = vector_size_in_bytes / sizeof(T); |
| |
| auto src_element_index = op_index * elts_per_register + |
| dst_element_index / 2 + |
| dst_reg_index * elts_per_register / 2; |
| |
| if (dst_element_index & 1) { |
| return GetInstructionSource<T>(inst, 1, scalar ? 0 : src_element_index); |
| } else { |
| return GetInstructionSource<T>(inst, 0, src_element_index); |
| } |
| } |
| |
| template <typename T> |
| void KelvinVZip(bool scalar, bool strip_mine, Instruction *inst) { |
| KelvinBinaryVectorOp<false /* halftype */, true /* widen_dst */, T, T, T>( |
| inst, scalar, strip_mine, |
| std::function<T(T, T)>([](T vs1, T vs2) -> T { return vs1; }), |
| SourceArgGetter<T, T, T, T>(VZipOpGetArg1<T>), |
| SourceArgGetter<T, T, T, T>(VZipOpGetArg1<T>)); |
| } |
| template void KelvinVZip<int8_t>(bool, bool, Instruction *); |
| template void KelvinVZip<int16_t>(bool, bool, Instruction *); |
| template void KelvinVZip<int32_t>(bool, bool, Instruction *); |
| } // namespace kelvin::sim |