blob: 0719491797fd4260d7c58fb99970787316a1e823 [file] [log] [blame]
// Copyright 2023 Google LLC
#ifndef TESTS_VERILATOR_SIM_KELVIN_VALU_H_
#define TESTS_VERILATOR_SIM_KELVIN_VALU_H_
#include "tests/verilator_sim/kelvin/alu_ref.h"
#include "tests/verilator_sim/kelvin/kelvin_cfg.h"
#include "tests/verilator_sim/kelvin/vencodeop.h"
// Number of 32-bit words held by each port of valu_t (kVector is declared
// outside this header; presumably the vector width in bits -- TODO confirm).
constexpr int kLanes = kVector / 32;
// Number of operand (read) ports modelled by valu_t::in and valu_t::r.
constexpr int kReadPorts = 7;
// Number of result (write) ports modelled by valu_t::out and valu_t::w.
constexpr int kWritePorts = 4;
struct valu_t {
uint8_t op : 7;
uint8_t f2 : 3;
uint8_t sz : 3;
struct {
uint32_t data[kLanes];
} in[kReadPorts];
struct {
uint32_t data[kLanes];
} out[kWritePorts];
struct {
uint32_t data;
} sv;
// Tracking the read/write/scalar controls.
struct {
bool valid;
uint8_t addr : 6;
uint8_t tag : 1;
} r[kReadPorts];
struct {
bool valid;
uint8_t addr : 6;
} w[kWritePorts];
struct {
bool valid;
} scalar;
bool operator!=(const valu_t& rhs) const {
if (w[0].valid != rhs.w[0].valid) return true;
if (w[1].valid != rhs.w[1].valid) return true;
if (w[0].valid && w[0].addr != rhs.w[0].addr) return true;
if (w[1].valid && w[1].addr != rhs.w[1].addr) return true;
for (int i = 0; i < kLanes; ++i) {
if (w[0].valid && out[0].data[i] != rhs.out[0].data[i]) return true;
if (w[1].valid && out[1].data[i] != rhs.out[1].data[i]) return true;
}
return false;
}
void print(const char* name, const bool inputs = false) {
printf("[%s] op=%d f2=%d sz=%d valid=[%d,%d] waddr=%d", name, op, f2, sz,
w[0].valid, w[1].valid, w[0].valid ? w[0].addr : 0);
if (w[1].valid) {
printf(" {%d}", w[1].addr);
}
printf(" wdata =");
for (int i = 0; i < kLanes; ++i) {
printf(" %08x", w[0].valid ? out[0].data[i] : 0);
}
if (w[1].valid) {
printf(" : {");
for (int i = 0; i < kLanes; ++i) {
printf(" %08x", out[1].data[i]);
}
printf(" }");
}
printf("\n");
if (inputs) {
printf("\n");
for (int i = 0; i < kReadPorts; ++i) {
printf(" read%d =", i);
for (int j = 0; j < kLanes; ++j) {
printf(" %08x", in[i].data[j]);
}
printf("\n");
}
}
}
};
// VOP1U(func): applies the unary `func` to each unsigned element of the 32-bit
// word `a` (four bytes, two halfwords or one word, selected by `sz`), ORs the
// shifted results into `x` and sets `v`. Expands to statements that use the
// caller's locals `sz`, `a`, `v` and `x`; for any other `sz` nothing is
// written.
#define VOP1U(func) \
if (sz == 1) { \
v = 1; \
x = func(uint8_t(a)) | func(uint8_t(a >> 8)) << 8 | \
func(uint8_t(a >> 16)) << 16 | func(uint8_t(a >> 24)) << 24; \
} \
if (sz == 2) { \
v = 1; \
x = func(uint16_t(a)) | func(uint16_t(a >> 16)) << 16; \
} \
if (sz == 4) { \
v = 1; \
x = func(uint32_t(a)); \
}
// VOP1PU(func): paired form of VOP1U. Applies the unary `func` to the unsigned
// elements of `a` (result in `x`) and of `c` (result in `y`) and sets both `v`
// and `w`. Uses the caller's locals `sz`, `a`, `c`, `v`, `w`, `x` and `y`.
#define VOP1PU(func) \
if (sz == 1) { \
v = 1; \
w = 1; \
x = func(uint8_t(a)) | func(uint8_t(a >> 8)) << 8 | \
func(uint8_t(a >> 16)) << 16 | func(uint8_t(a >> 24)) << 24; \
y = func(uint8_t(c)) | func(uint8_t(c >> 8)) << 8 | \
func(uint8_t(c >> 16)) << 16 | func(uint8_t(c >> 24)) << 24; \
} \
if (sz == 2) { \
v = 1; \
w = 1; \
x = func(uint16_t(a)) | func(uint16_t(a >> 16)) << 16; \
y = func(uint16_t(c)) | func(uint16_t(c >> 16)) << 16; \
} \
if (sz == 4) { \
v = 1; \
w = 1; \
x = func(uint32_t(a)); \
y = func(uint32_t(c)); \
}
// VOPXU(func): same shape as VOP1U but the unary `func` is applied to the
// unsigned elements of operand `b` instead of `a`. Writes `x` and sets `v`.
#define VOPXU(func) \
if (sz == 1) { \
v = 1; \
x = func(uint8_t(b)) | func(uint8_t(b >> 8)) << 8 | \
func(uint8_t(b >> 16)) << 16 | func(uint8_t(b >> 24)) << 24; \
} \
if (sz == 2) { \
v = 1; \
x = func(uint16_t(b)) | func(uint16_t(b >> 16)) << 16; \
} \
if (sz == 4) { \
v = 1; \
x = func(uint32_t(b)); \
}
// VOP2S(func): applies the binary `func` to matching signed elements of `a`
// and `b` (int8/int16/int32 selected by `sz`), truncates every result to the
// element width, packs the results into `x` and sets `v`.
#define VOP2S(func) \
if (sz == 1) { \
v = 1; \
x = uint8_t(func(int8_t(a), int8_t(b))) | \
uint8_t(func(int8_t(a >> 8), int8_t(b >> 8))) << 8 | \
uint8_t(func(int8_t(a >> 16), int8_t(b >> 16))) << 16 | \
uint8_t(func(int8_t(a >> 24), int8_t(b >> 24))) << 24; \
} else if (sz == 2) { \
v = 1; \
x = uint16_t(func(int16_t(a), int16_t(b))) | \
uint16_t(func(int16_t(a >> 16), int16_t(b >> 16))) << 16; \
} else if (sz == 4) { \
v = 1; \
x = uint32_t(func(int32_t(a), int32_t(b))); \
}
// VOP2U(func): applies the binary `func` to matching unsigned elements of `a`
// and `b`, packs the results into `x` and sets `v`. Unlike VOP2S the results
// are shifted into place without a narrowing cast, so `func` presumably
// returns a value that already fits the element width -- TODO confirm.
#define VOP2U(func) \
if (sz == 1) { \
v = 1; \
x = func(uint8_t(a), uint8_t(b)) | \
func(uint8_t(a >> 8), uint8_t(b >> 8)) << 8 | \
func(uint8_t(a >> 16), uint8_t(b >> 16)) << 16 | \
func(uint8_t(a >> 24), uint8_t(b >> 24)) << 24; \
} else if (sz == 2) { \
v = 1; \
x = func(uint16_t(a), uint16_t(b)) | \
func(uint16_t(a >> 16), uint16_t(b >> 16)) << 16; \
} else if (sz == 4) { \
v = 1; \
x = func(uint32_t(a), uint32_t(b)); \
}
// VOP2(func): expands to VOP2S(func) when the caller's `f2_signed` is true and
// to VOP2U(func) otherwise.
#define VOP2(func) \
if (f2_signed) { \
VOP2S(func) \
} else { \
VOP2U(func) \
}
// VOP2S_R(func, r): VOP2S with one extra argument. Calls func(a_elem, b_elem,
// r) on matching signed elements, truncates each result to the element width,
// packs the results into `x` and sets `v`.
#define VOP2S_R(func, r) \
if (sz == 1) { \
v = 1; \
x = uint8_t(func(int8_t(a), int8_t(b), r)) | \
uint8_t(func(int8_t(a >> 8), int8_t(b >> 8), r)) << 8 | \
uint8_t(func(int8_t(a >> 16), int8_t(b >> 16), r)) << 16 | \
uint8_t(func(int8_t(a >> 24), int8_t(b >> 24), r)) << 24; \
} else if (sz == 2) { \
v = 1; \
x = uint16_t(func(int16_t(a), int16_t(b), r)) | \
uint16_t(func(int16_t(a >> 16), int16_t(b >> 16), r)) << 16; \
} else if (sz == 4) { \
v = 1; \
x = uint32_t(func(int32_t(a), int32_t(b), r)); \
}
// VOP2U_R(func, r): unsigned counterpart of VOP2S_R. Calls func(a_elem,
// b_elem, r) on matching unsigned elements, truncates each result to the
// element width, packs the results into `x` and sets `v`.
#define VOP2U_R(func, r) \
if (sz == 1) { \
v = 1; \
x = uint8_t(func(uint8_t(a), uint8_t(b), r)) | \
uint8_t(func(uint8_t(a >> 8), uint8_t(b >> 8), r)) << 8 | \
uint8_t(func(uint8_t(a >> 16), uint8_t(b >> 16), r)) << 16 | \
uint8_t(func(uint8_t(a >> 24), uint8_t(b >> 24), r)) << 24; \
} else if (sz == 2) { \
v = 1; \
x = uint16_t(func(uint16_t(a), uint16_t(b), r)) | \
uint16_t(func(uint16_t(a >> 16), uint16_t(b >> 16), r)) << 16; \
} else if (sz == 4) { \
v = 1; \
x = uint32_t(func(uint32_t(a), uint32_t(b), r)); \
}
// VOP2_R(func, r): expands to VOP2S_R when the caller's `f2_signed` is true
// and to VOP2U_R otherwise.
#define VOP2_R(func, r) \
if (f2_signed) { \
VOP2S_R(func, r) \
} else { \
VOP2U_R(func, r) \
}
// VOP2PS(func): paired signed binary operation. `x` receives func applied to
// the signed elements of `a` and `b`, `y` receives func applied to the signed
// elements of `c` and the same `b`; results are truncated to the element
// width. Sets both `v` and `w`.
#define VOP2PS(func) \
if (sz == 1) { \
v = 1; \
w = 1; \
x = uint8_t(func(int8_t(a), int8_t(b))) | \
uint8_t(func(int8_t(a >> 8), int8_t(b >> 8))) << 8 | \
uint8_t(func(int8_t(a >> 16), int8_t(b >> 16))) << 16 | \
uint8_t(func(int8_t(a >> 24), int8_t(b >> 24))) << 24; \
y = uint8_t(func(int8_t(c), int8_t(b))) | \
uint8_t(func(int8_t(c >> 8), int8_t(b >> 8))) << 8 | \
uint8_t(func(int8_t(c >> 16), int8_t(b >> 16))) << 16 | \
uint8_t(func(int8_t(c >> 24), int8_t(b >> 24))) << 24; \
} else if (sz == 2) { \
v = 1; \
w = 1; \
x = uint16_t(func(int16_t(a), int16_t(b))) | \
uint16_t(func(int16_t(a >> 16), int16_t(b >> 16))) << 16; \
y = uint16_t(func(int16_t(c), int16_t(b))) | \
uint16_t(func(int16_t(c >> 16), int16_t(b >> 16))) << 16; \
} else if (sz == 4) { \
v = 1; \
w = 1; \
x = uint32_t(func(int32_t(a), int32_t(b))); \
y = uint32_t(func(int32_t(c), int32_t(b))); \
}
// VOP2PU(func): paired unsigned binary operation. `x` receives func applied to
// the unsigned elements of `a` and `b`, `y` receives func applied to the
// unsigned elements of `c` and the same `b`. Results are shifted into place
// without a narrowing cast. Sets both `v` and `w`.
#define VOP2PU(func) \
if (sz == 1) { \
v = 1; \
w = 1; \
x = func(uint8_t(a), uint8_t(b)) | \
func(uint8_t(a >> 8), uint8_t(b >> 8)) << 8 | \
func(uint8_t(a >> 16), uint8_t(b >> 16)) << 16 | \
func(uint8_t(a >> 24), uint8_t(b >> 24)) << 24; \
y = func(uint8_t(c), uint8_t(b)) | \
func(uint8_t(c >> 8), uint8_t(b >> 8)) << 8 | \
func(uint8_t(c >> 16), uint8_t(b >> 16)) << 16 | \
func(uint8_t(c >> 24), uint8_t(b >> 24)) << 24; \
} else if (sz == 2) { \
v = 1; \
w = 1; \
x = func(uint16_t(a), uint16_t(b)) | \
func(uint16_t(a >> 16), uint16_t(b >> 16)) << 16; \
y = func(uint16_t(c), uint16_t(b)) | \
func(uint16_t(c >> 16), uint16_t(b >> 16)) << 16; \
} else if (sz == 4) { \
v = 1; \
w = 1; \
x = func(uint32_t(a), uint32_t(b)); \
y = func(uint32_t(c), uint32_t(b)); \
}
// VOP2P(func): expands to VOP2PS(func) when the caller's `f2_signed` is true
// and to VOP2PU(func) otherwise.
#define VOP2P(func) \
if (f2_signed) { \
VOP2PS(func) \
} else { \
VOP2PU(func) \
}
// VOP2PS_R(func, r): VOP2PS with one extra argument `r` forwarded to every
// func call. `x` is computed from the signed elements of (a, b), `y` from the
// signed elements of (c, b); sets both `v` and `w`.
#define VOP2PS_R(func, r) \
if (sz == 1) { \
v = 1; \
w = 1; \
x = uint8_t(func(int8_t(a), int8_t(b), r)) | \
uint8_t(func(int8_t(a >> 8), int8_t(b >> 8), r)) << 8 | \
uint8_t(func(int8_t(a >> 16), int8_t(b >> 16), r)) << 16 | \
uint8_t(func(int8_t(a >> 24), int8_t(b >> 24), r)) << 24; \
y = uint8_t(func(int8_t(c), int8_t(b), r)) | \
uint8_t(func(int8_t(c >> 8), int8_t(b >> 8), r)) << 8 | \
uint8_t(func(int8_t(c >> 16), int8_t(b >> 16), r)) << 16 | \
uint8_t(func(int8_t(c >> 24), int8_t(b >> 24), r)) << 24; \
} else if (sz == 2) { \
v = 1; \
w = 1; \
x = uint16_t(func(int16_t(a), int16_t(b), r)) | \
uint16_t(func(int16_t(a >> 16), int16_t(b >> 16), r)) << 16; \
y = uint16_t(func(int16_t(c), int16_t(b), r)) | \
uint16_t(func(int16_t(c >> 16), int16_t(b >> 16), r)) << 16; \
} else if (sz == 4) { \
v = 1; \
w = 1; \
x = uint32_t(func(int32_t(a), int32_t(b), r)); \
y = uint32_t(func(int32_t(c), int32_t(b), r)); \
}
// VOP2PU_R(func, r): unsigned counterpart of VOP2PS_R. `x` is computed from
// the unsigned elements of (a, b), `y` from the unsigned elements of (c, b),
// each call receiving `r`; results are truncated to the element width. Sets
// both `v` and `w`.
#define VOP2PU_R(func, r) \
if (sz == 1) { \
v = 1; \
w = 1; \
x = uint8_t(func(uint8_t(a), uint8_t(b), r)) | \
uint8_t(func(uint8_t(a >> 8), uint8_t(b >> 8), r)) << 8 | \
uint8_t(func(uint8_t(a >> 16), uint8_t(b >> 16), r)) << 16 | \
uint8_t(func(uint8_t(a >> 24), uint8_t(b >> 24), r)) << 24; \
y = uint8_t(func(uint8_t(c), uint8_t(b), r)) | \
uint8_t(func(uint8_t(c >> 8), uint8_t(b >> 8), r)) << 8 | \
uint8_t(func(uint8_t(c >> 16), uint8_t(b >> 16), r)) << 16 | \
uint8_t(func(uint8_t(c >> 24), uint8_t(b >> 24), r)) << 24; \
} else if (sz == 2) { \
v = 1; \
w = 1; \
x = uint16_t(func(uint16_t(a), uint16_t(b), r)) | \
uint16_t(func(uint16_t(a >> 16), uint16_t(b >> 16), r)) << 16; \
y = uint16_t(func(uint16_t(c), uint16_t(b), r)) | \
uint16_t(func(uint16_t(c >> 16), uint16_t(b >> 16), r)) << 16; \
} else if (sz == 4) { \
v = 1; \
w = 1; \
x = uint32_t(func(uint32_t(a), uint32_t(b), r)); \
y = uint32_t(func(uint32_t(c), uint32_t(b), r)); \
}
// VOP2P_R(func, r): expands to VOP2PS_R when the caller's `f2_signed` is true
// and to VOP2PU_R otherwise.
#define VOP2P_R(func, r) \
if (f2_signed) { \
VOP2PS_R(func, r) \
} else { \
VOP2PU_R(func, r) \
}
// VOP2S_R_X(func, r, s): VOP2S with two extra arguments. Calls func(a_elem,
// b_elem, r, s) on matching signed elements, truncates each result to the
// element width, packs the results into `x` and sets `v`.
#define VOP2S_R_X(func, r, s) \
if (sz == 1) { \
v = 1; \
x = uint8_t(func(int8_t(a), int8_t(b), r, s)) | \
uint8_t(func(int8_t(a >> 8), int8_t(b >> 8), r, s)) << 8 | \
uint8_t(func(int8_t(a >> 16), int8_t(b >> 16), r, s)) << 16 | \
uint8_t(func(int8_t(a >> 24), int8_t(b >> 24), r, s)) << 24; \
} else if (sz == 2) { \
v = 1; \
x = uint16_t(func(int16_t(a), int16_t(b), r, s)) | \
uint16_t(func(int16_t(a >> 16), int16_t(b >> 16), r, s)) << 16; \
} else if (sz == 4) { \
v = 1; \
x = uint32_t(func(int32_t(a), int32_t(b), r, s)); \
}
// VOP2U_R_X(func, r, s): unsigned counterpart of VOP2S_R_X. Calls
// func(a_elem, b_elem, r, s) on matching unsigned elements, truncates each
// result to the element width, packs the results into `x` and sets `v`.
#define VOP2U_R_X(func, r, s) \
if (sz == 1) { \
v = 1; \
x = uint8_t(func(uint8_t(a), uint8_t(b), r, s)) | \
uint8_t(func(uint8_t(a >> 8), uint8_t(b >> 8), r, s)) << 8 | \
uint8_t(func(uint8_t(a >> 16), uint8_t(b >> 16), r, s)) << 16 | \
uint8_t(func(uint8_t(a >> 24), uint8_t(b >> 24), r, s)) << 24; \
} else if (sz == 2) { \
v = 1; \
x = uint16_t(func(uint16_t(a), uint16_t(b), r, s)) | \
uint16_t(func(uint16_t(a >> 16), uint16_t(b >> 16), r, s)) << 16; \
} else if (sz == 4) { \
v = 1; \
x = uint32_t(func(uint32_t(a), uint32_t(b), r, s)); \
}
// VOP2_R_X(func, r, s): expands to VOP2S_R_X when the caller's `f2_signed` is
// true and to VOP2U_R_X otherwise.
#define VOP2_R_X(func, r, s) \
if (f2_signed) { \
VOP2S_R_X(func, r, s) \
} else { \
VOP2U_R_X(func, r, s) \
}
// VOP2PS_R_X(func, r, s): paired signed operation with two extra arguments.
// `x` is computed from the signed elements of (a, b), `y` from the signed
// elements of (c, b), each call receiving `r` and `s`; results are truncated
// to the element width. Sets both `v` and `w`.
#define VOP2PS_R_X(func, r, s) \
if (sz == 1) { \
v = 1; \
w = 1; \
x = uint8_t(func(int8_t(a), int8_t(b), r, s)) | \
uint8_t(func(int8_t(a >> 8), int8_t(b >> 8), r, s)) << 8 | \
uint8_t(func(int8_t(a >> 16), int8_t(b >> 16), r, s)) << 16 | \
uint8_t(func(int8_t(a >> 24), int8_t(b >> 24), r, s)) << 24; \
y = uint8_t(func(int8_t(c), int8_t(b), r, s)) | \
uint8_t(func(int8_t(c >> 8), int8_t(b >> 8), r, s)) << 8 | \
uint8_t(func(int8_t(c >> 16), int8_t(b >> 16), r, s)) << 16 | \
uint8_t(func(int8_t(c >> 24), int8_t(b >> 24), r, s)) << 24; \
} else if (sz == 2) { \
v = 1; \
w = 1; \
x = uint16_t(func(int16_t(a), int16_t(b), r, s)) | \
uint16_t(func(int16_t(a >> 16), int16_t(b >> 16), r, s)) << 16; \
y = uint16_t(func(int16_t(c), int16_t(b), r, s)) | \
uint16_t(func(int16_t(c >> 16), int16_t(b >> 16), r, s)) << 16; \
} else if (sz == 4) { \
v = 1; \
w = 1; \
x = uint32_t(func(int32_t(a), int32_t(b), r, s)); \
y = uint32_t(func(int32_t(c), int32_t(b), r, s)); \
}
// VOP2PU_R_X(func, r, s): unsigned counterpart of VOP2PS_R_X. `x` is computed
// from the unsigned elements of (a, b), `y` from the unsigned elements of
// (c, b), each call receiving `r` and `s`; results are truncated to the
// element width. Sets both `v` and `w`.
#define VOP2PU_R_X(func, r, s) \
if (sz == 1) { \
v = 1; \
w = 1; \
x = uint8_t(func(uint8_t(a), uint8_t(b), r, s)) | \
uint8_t(func(uint8_t(a >> 8), uint8_t(b >> 8), r, s)) << 8 | \
uint8_t(func(uint8_t(a >> 16), uint8_t(b >> 16), r, s)) << 16 | \
uint8_t(func(uint8_t(a >> 24), uint8_t(b >> 24), r, s)) << 24; \
y = uint8_t(func(uint8_t(c), uint8_t(b), r, s)) | \
uint8_t(func(uint8_t(c >> 8), uint8_t(b >> 8), r, s)) << 8 | \
uint8_t(func(uint8_t(c >> 16), uint8_t(b >> 16), r, s)) << 16 | \
uint8_t(func(uint8_t(c >> 24), uint8_t(b >> 24), r, s)) << 24; \
} else if (sz == 2) { \
v = 1; \
w = 1; \
x = uint16_t(func(uint16_t(a), uint16_t(b), r, s)) | \
uint16_t(func(uint16_t(a >> 16), uint16_t(b >> 16), r, s)) << 16; \
y = uint16_t(func(uint16_t(c), uint16_t(b), r, s)) | \
uint16_t(func(uint16_t(c >> 16), uint16_t(b >> 16), r, s)) << 16; \
} else if (sz == 4) { \
v = 1; \
w = 1; \
x = uint32_t(func(uint32_t(a), uint32_t(b), r, s)); \
y = uint32_t(func(uint32_t(c), uint32_t(b), r, s)); \
}
// VOP2P_R_X(func, r, s): expands to VOP2PS_R_X when the caller's `f2_signed`
// is true and to VOP2PU_R_X otherwise.
#define VOP2P_R_X(func, r, s) \
if (f2_signed) { \
VOP2PS_R_X(func, r, s) \
} else { \
VOP2PU_R_X(func, r, s) \
}
// VOP2W(func): word-only binary operation. For sz == 4 `x` is func(a, b) on
// the full unsigned 32-bit words; for sz == 1 and sz == 2 `x` is forced to 0.
// `v` is set in all three cases.
#define VOP2W(func) \
if (sz == 1) { \
v = 1; \
x = 0; \
} else if (sz == 2) { \
v = 1; \
x = 0; \
} else if (sz == 4) { \
v = 1; \
x = func(uint32_t(a), uint32_t(b)); \
}
// VOP3S(func): applies the ternary `func` to matching signed elements of `a`,
// `b` and `c`, truncates every result to the element width, packs the results
// into `x` and sets `v`.
#define VOP3S(func) \
if (sz == 1) { \
v = 1; \
x = uint8_t(func(int8_t(a), int8_t(b), int8_t(c))) | \
uint8_t(func(int8_t(a >> 8), int8_t(b >> 8), int8_t(c >> 8))) << 8 | \
uint8_t(func(int8_t(a >> 16), int8_t(b >> 16), int8_t(c >> 16))) \
<< 16 | \
uint8_t(func(int8_t(a >> 24), int8_t(b >> 24), int8_t(c >> 24))) \
<< 24; \
} else if (sz == 2) { \
v = 1; \
x = uint16_t(func(int16_t(a), int16_t(b), int16_t(c))) | \
uint16_t(func(int16_t(a >> 16), int16_t(b >> 16), int16_t(c >> 16))) \
<< 16; \
} else if (sz == 4) { \
v = 1; \
x = uint32_t(func(int32_t(a), int32_t(b), int32_t(c))); \
}
// VOP3(func): expands to VOP3S(func) when the caller's `f2_signed` is true and
// to VOP3U(func) otherwise. VOP3U is defined further down; that is fine
// because macros are only expanded at the point of use.
#define VOP3(func) \
if (f2_signed) { \
VOP3S(func) \
} else { \
VOP3U(func) \
}
// VOP3U(func): applies the ternary `func` to matching unsigned elements of
// `a`, `b` and `c`, truncates every result to the element width, packs the
// results into `x` and sets `v`.
#define VOP3U(func) \
if (sz == 1) { \
v = 1; \
x = uint8_t(func(uint8_t(a), uint8_t(b), uint8_t(c))) | \
uint8_t(func(uint8_t(a >> 8), uint8_t(b >> 8), uint8_t(c >> 8))) \
<< 8 | \
uint8_t(func(uint8_t(a >> 16), uint8_t(b >> 16), uint8_t(c >> 16))) \
<< 16 | \
uint8_t(func(uint8_t(a >> 24), uint8_t(b >> 24), uint8_t(c >> 24))) \
<< 24; \
} else if (sz == 2) { \
v = 1; \
x = uint16_t(func(uint16_t(a), uint16_t(b), uint16_t(c))) | \
uint16_t( \
func(uint16_t(a >> 16), uint16_t(b >> 16), uint16_t(c >> 16))) \
<< 16; \
} else if (sz == 4) { \
v = 1; \
x = uint32_t(func(uint32_t(a), uint32_t(b), uint32_t(c))); \
}
// VOP3NS_R_U(func, r, u): narrowing operation whose first operand is twice as
// wide as the result element. The wide operands are taken alternately from
// `a` and `c` while the second operand walks through the elements of `b`:
//   sz == 1: result bytes 0..3 come from a[15:0], c[15:0], a[31:16], c[31:16]
//            paired with bytes 0..3 of b.
//   sz == 2: result halfwords 0..1 come from the 32-bit a and c paired with
//            halfwords 0..1 of b.
//   sz == 4: `x` is forced to 0.
// Every call is func(wide, narrow, r, u) with the wide operand cast to a
// signed type; `v` is set in all three cases.
#define VOP3NS_R_U(func, r, u) \
if (sz == 1) { \
v = 1; \
x = uint8_t(func(int16_t(a), int8_t(b), r, u)) | \
uint8_t(func(int16_t(c), int8_t(b >> 8), r, u)) << 8 | \
uint8_t(func(int16_t(a >> 16), int8_t(b >> 16), r, u)) << 16 | \
uint8_t(func(int16_t(c >> 16), int8_t(b >> 24), r, u)) << 24; \
} else if (sz == 2) { \
v = 1; \
x = uint16_t(func(int32_t(a), int16_t(b), r, u)) | \
uint16_t(func(int32_t(c), int16_t(b >> 16), r, u)) << 16; \
} else if (sz == 4) { \
v = 1; \
x = 0; \
}
// VOP3QS_R_U(func, r, u): quarter-width narrowing operation. For sz == 1 the
// result bytes 0..3 are func(wide, b_byte, r, u) with the 32-bit wide operand
// taken from `a`, `d`, `c` and `f` respectively and b_byte being bytes 0..3 of
// `b`. For sz == 2 and sz == 4 `x` is forced to 0. `v` is set in all cases.
#define VOP3QS_R_U(func, r, u) \
if (sz == 1) { \
v = 1; \
x = uint8_t(func(int32_t(a), int8_t(b), r, u)) | \
uint8_t(func(int32_t(d), int8_t(b >> 8), r, u)) << 8 | \
uint8_t(func(int32_t(c), int8_t(b >> 16), r, u)) << 16 | \
uint8_t(func(int32_t(f), int8_t(b >> 24), r, u)) << 24; \
} else if (sz == 2) { \
v = 1; \
x = 0; \
} else if (sz == 4) { \
v = 1; \
x = 0; \
}
// VOP2M(func): binary operation with two results per element. `func` is
// called on matching unsigned elements of `a` and `b` and must return an
// object with `.first` and `.second` members; the `.first` values are packed
// into `x` and the `.second` values into `y`. Sets both `v` and `w`.
#define VOP2M(func) \
if (sz == 1) { \
v = 1; \
w = 1; \
auto p0 = func(uint8_t(a), uint8_t(b)); \
auto p1 = func(uint8_t(a >> 8), uint8_t(b >> 8)); \
auto p2 = func(uint8_t(a >> 16), uint8_t(b >> 16)); \
auto p3 = func(uint8_t(a >> 24), uint8_t(b >> 24)); \
x = p0.first | (p1.first << 8) | (p2.first << 16) | (p3.first << 24); \
y = p0.second | (p1.second << 8) | (p2.second << 16) | (p3.second << 24); \
} else if (sz == 2) { \
v = 1; \
w = 1; \
auto p0 = func(uint16_t(a), uint16_t(b)); \
auto p1 = func(uint16_t(a >> 16), uint16_t(b >> 16)); \
x = p0.first | (p1.first << 16); \
y = p0.second | (p1.second << 16); \
} else if (sz == 4) { \
v = 1; \
w = 1; \
auto p = func(uint32_t(a), uint32_t(b)); \
x = p.first; \
y = p.second; \
}
// VOPPS(func): signed pairwise operation on adjacent elements of `a`, with a
// result twice as wide as the inputs:
//   sz == 2: two 16-bit results from the signed byte pairs (0,1) and (2,3).
//   sz == 4: one 32-bit result from the two signed halfwords.
//   sz == 1: `x` is forced to 0.
// `v` is set in all three cases.
#define VOPPS(func) \
if (sz == 1) { \
v = 1; \
x = 0; \
} else if (sz == 2) { \
v = 1; \
x = uint16_t(func(int8_t(a), int8_t(a >> 8))) | \
uint16_t(func(int8_t(a >> 16), int8_t(a >> 24))) << 16; \
} else if (sz == 4) { \
v = 1; \
x = uint32_t(func(int16_t(a), int16_t(a >> 16))); \
}
// VOPPU(func): unsigned counterpart of VOPPS. Applies `func` to adjacent
// unsigned byte pairs (sz == 2) or to the two unsigned halfwords (sz == 4) of
// `a`; for sz == 1 `x` is forced to 0. `v` is set in all three cases.
#define VOPPU(func) \
if (sz == 1) { \
v = 1; \
x = 0; \
} else if (sz == 2) { \
v = 1; \
x = uint16_t(func(uint8_t(a), uint8_t(a >> 8))) | \
uint16_t(func(uint8_t(a >> 16), uint8_t(a >> 24))) << 16; \
} else if (sz == 4) { \
v = 1; \
x = uint32_t(func(uint16_t(a), uint16_t(a >> 16))); \
}
// VOPP(func): expands to VOPPS(func) when the caller's `f2_signed` is true and
// to VOPPU(func) otherwise.
#define VOPP(func) \
if (f2_signed) { \
VOPPS(func) \
} else { \
VOPPU(func) \
}
// WOP2U(func): unsigned widening binary operation; every result is twice as
// wide as its inputs and the results are split across `x` and `y`:
//   sz == 2: func on the four unsigned byte pairs of (a, b) gives p0..p3;
//            x = {p2, p0} (even elements), y = {p3, p1} (odd elements).
//   sz == 4: func on the two unsigned halfword pairs; x is the low-pair
//            result and y the high-pair result.
//   sz == 1: x and y are forced to 0.
// Sets both `v` and `w` in all three cases.
#define WOP2U(func) \
if (sz == 1) { \
v = 1; \
w = 1; \
x = 0; \
y = 0; \
} else if (sz == 2) { \
v = 1; \
w = 1; \
uint16_t p0 = func(uint8_t(a), uint8_t(b)); \
uint16_t p1 = func(uint8_t(a >> 8), uint8_t(b >> 8)); \
uint16_t p2 = func(uint8_t(a >> 16), uint8_t(b >> 16)); \
uint16_t p3 = func(uint8_t(a >> 24), uint8_t(b >> 24)); \
x = p0 | (p2 << 16); \
y = p1 | (p3 << 16); \
} else if (sz == 4) { \
v = 1; \
w = 1; \
uint32_t p0 = func(uint16_t(a), uint16_t(b)); \
uint32_t p1 = func(uint16_t(a >> 16), uint16_t(b >> 16)); \
x = p0; \
y = p1; \
}
// WOP2S(func): signed counterpart of WOP2U. The inputs are cast to int8_t
// (sz == 2) or int16_t (sz == 4) before calling `func`; the widened results
// are distributed over `x` (even elements) and `y` (odd elements) in the same
// way. For sz == 1 x and y are forced to 0. Sets both `v` and `w`.
#define WOP2S(func) \
if (sz == 1) { \
v = 1; \
w = 1; \
x = 0; \
y = 0; \
} else if (sz == 2) { \
v = 1; \
w = 1; \
uint16_t p0 = func(int8_t(a), int8_t(b)); \
uint16_t p1 = func(int8_t(a >> 8), int8_t(b >> 8)); \
uint16_t p2 = func(int8_t(a >> 16), int8_t(b >> 16)); \
uint16_t p3 = func(int8_t(a >> 24), int8_t(b >> 24)); \
x = p0 | (p2 << 16); \
y = p1 | (p3 << 16); \
} else if (sz == 4) { \
v = 1; \
w = 1; \
uint32_t p0 = func(int16_t(a), int16_t(b)); \
uint32_t p1 = func(int16_t(a >> 16), int16_t(b >> 16)); \
x = p0; \
y = p1; \
}
// WOP2(func): expands to WOP2S(func) when the caller's `f2_signed` is true and
// to WOP2U(func) otherwise.
#define WOP2(func) \
if (f2_signed) { \
WOP2S(func) \
} else { \
WOP2U(func) \
}
// WOPAU(func): unsigned operation combining a wide first operand with a
// narrow second operand. The wide elements live in `a` (even results) and `c`
// (odd results), the narrow elements in `b`:
//   sz == 2: p0 = func(a[15:0], b byte 0), p1 = func(c[15:0], b byte 1),
//            p2 = func(a[31:16], b byte 2), p3 = func(c[31:16], b byte 3);
//            x = {p2, p0}, y = {p3, p1}.
//   sz == 4: x = func(a, b[15:0]), y = func(c, b[31:16]).
//   sz == 1: x and y are forced to 0.
// Sets both `v` and `w` in all three cases.
#define WOPAU(func) \
if (sz == 1) { \
v = 1; \
w = 1; \
x = 0; \
y = 0; \
} else if (sz == 2) { \
v = 1; \
w = 1; \
uint16_t p0 = func(uint16_t(a), uint8_t(b)); \
uint16_t p1 = func(uint16_t(c), uint8_t(b >> 8)); \
uint16_t p2 = func(uint16_t(a >> 16), uint8_t(b >> 16)); \
uint16_t p3 = func(uint16_t(c >> 16), uint8_t(b >> 24)); \
x = p0 | (p2 << 16); \
y = p1 | (p3 << 16); \
} else if (sz == 4) { \
v = 1; \
w = 1; \
uint32_t p0 = func(uint32_t(a), uint16_t(b)); \
uint32_t p1 = func(uint32_t(c), uint16_t(b >> 16)); \
x = p0; \
y = p1; \
}
// WOPAS(func): signed counterpart of WOPAU. The wide operands from `a`/`c`
// and the narrow operands from `b` are cast to signed types before calling
// `func`; the results are distributed over `x` and `y` in the same way. For
// sz == 1 x and y are forced to 0. Sets both `v` and `w`.
#define WOPAS(func) \
if (sz == 1) { \
v = 1; \
w = 1; \
x = 0; \
y = 0; \
} else if (sz == 2) { \
v = 1; \
w = 1; \
uint16_t p0 = func(int16_t(a), int8_t(b)); \
uint16_t p1 = func(int16_t(c), int8_t(b >> 8)); \
uint16_t p2 = func(int16_t(a >> 16), int8_t(b >> 16)); \
uint16_t p3 = func(int16_t(c >> 16), int8_t(b >> 24)); \
x = p0 | (p2 << 16); \
y = p1 | (p3 << 16); \
} else if (sz == 4) { \
v = 1; \
w = 1; \
uint32_t p0 = func(int32_t(a), int16_t(b)); \
uint32_t p1 = func(int32_t(c), int16_t(b >> 16)); \
x = p0; \
y = p1; \
}
// WOPA(func): expands to WOPAS(func) when the caller's `f2_signed` is true and
// to WOPAU(func) otherwise.
#define WOPA(func) \
if (f2_signed) { \
WOPAS(func) \
} else { \
WOPAU(func) \
}
// Slide-next over elements of type T: output element i is element
// (i + shift) of the sequence formed by read port 0 followed by read port 1,
// where shift = (f2 & 3) + 1. The result goes to write port 0, which is
// marked valid.
template <typename T>
void VSlidevn(valu_t& op) {
  constexpr int kElems = kLanes * 4 / sizeof(T);
  const int shift = (op.f2 & 3) + 1;
  const T* head = reinterpret_cast<const T*>(op.in[0].data);
  const T* tail = reinterpret_cast<const T*>(op.in[1].data);
  T* dst = reinterpret_cast<T*>(op.out[0].data);
  for (int idx = 0; idx < kElems; ++idx) {
    const int src = idx + shift;
    if (src < kElems) {
      dst[idx] = head[src];
    } else {
      dst[idx] = tail[src - kElems];
    }
  }
  op.w[0].valid = true;
}
// Slide-previous over elements of type T: with shift = (f2 & 3) + 1, the
// first `shift` output elements are the last `shift` elements of read port 0
// and the remaining ones are the leading elements of read port 1. The result
// goes to write port 0, which is marked valid.
template <typename T>
void VSlidevp(valu_t& op) {
  constexpr int kElems = kLanes * 4 / sizeof(T);
  const int shift = (op.f2 & 3) + 1;
  const T* prev = reinterpret_cast<const T*>(op.in[0].data);
  const T* curr = reinterpret_cast<const T*>(op.in[1].data);
  T* dst = reinterpret_cast<T*>(op.out[0].data);
  for (int idx = 0; idx < kElems; ++idx) {
    const int src = idx - shift;
    if (src < 0) {
      dst[idx] = prev[kElems + src];
    } else {
      dst[idx] = curr[src];
    }
  }
  op.w[0].valid = true;
}
// Two-register slide-next over elements of type T. With shift = (f2 & 3) + 1,
// write port 0 receives the slide of read ports (0, 1) and write port 1 the
// slide of read ports (1, 2); both write ports are marked valid.
template <typename T>
void VSlidehn2(valu_t& op) {
  constexpr int kElems = kLanes * 4 / sizeof(T);
  const int shift = (op.f2 & 3) + 1;
  const T* src0 = reinterpret_cast<const T*>(op.in[0].data);
  const T* src1 = reinterpret_cast<const T*>(op.in[1].data);
  const T* src2 = reinterpret_cast<const T*>(op.in[2].data);
  T* dst0 = reinterpret_cast<T*>(op.out[0].data);
  T* dst1 = reinterpret_cast<T*>(op.out[1].data);
  for (int idx = 0; idx < kElems; ++idx) {
    const int pos = idx + shift;
    if (pos < kElems) {
      dst0[idx] = src0[pos];
      dst1[idx] = src1[pos];
    } else {
      dst0[idx] = src1[pos - kElems];
      dst1[idx] = src2[pos - kElems];
    }
  }
  op.w[0].valid = true;
  op.w[1].valid = true;
}
// Two-register slide-previous over elements of type T. With
// shift = (f2 & 3) + 1, write port 0 receives the slide of read ports (0, 1)
// and write port 1 the slide of read ports (1, 2); both write ports are
// marked valid.
template <typename T>
void VSlidehp2(valu_t& op) {
  constexpr int kElems = kLanes * 4 / sizeof(T);
  const int shift = (op.f2 & 3) + 1;
  const T* src0 = reinterpret_cast<const T*>(op.in[0].data);
  const T* src1 = reinterpret_cast<const T*>(op.in[1].data);
  const T* src2 = reinterpret_cast<const T*>(op.in[2].data);
  T* dst0 = reinterpret_cast<T*>(op.out[0].data);
  T* dst1 = reinterpret_cast<T*>(op.out[1].data);
  for (int idx = 0; idx < kElems; ++idx) {
    const int pos = idx - shift;
    if (pos < 0) {
      dst0[idx] = src0[kElems + pos];
      dst1[idx] = src1[kElems + pos];
    } else {
      dst0[idx] = src1[pos];
      dst1[idx] = src2[pos];
    }
  }
  op.w[0].valid = true;
  op.w[1].valid = true;
}
// Runs VSlidevn<T> with the element type matching op.sz (1, 2 or 4 bytes);
// any other size trips the assert.
static void VSlidevn(valu_t& op) {
  const int elem_bytes = op.sz;
  if (elem_bytes == 1) {
    VSlidevn<uint8_t>(op);
  } else if (elem_bytes == 2) {
    VSlidevn<uint16_t>(op);
  } else if (elem_bytes == 4) {
    VSlidevn<uint32_t>(op);
  } else {
    assert(false);
  }
}
// Runs VSlidevp<T> with the element type matching op.sz (1, 2 or 4 bytes);
// any other size trips the assert.
static void VSlidevp(valu_t& op) {
  const int elem_bytes = op.sz;
  if (elem_bytes == 1) {
    VSlidevp<uint8_t>(op);
  } else if (elem_bytes == 2) {
    VSlidevp<uint16_t>(op);
  } else if (elem_bytes == 4) {
    VSlidevp<uint32_t>(op);
  } else {
    assert(false);
  }
}
// Runs VSlidehn2<T> with the element type matching op.sz (1, 2 or 4 bytes);
// any other size trips the assert.
static void VSlidehn2(valu_t& op) {
  const int elem_bytes = op.sz;
  if (elem_bytes == 1) {
    VSlidehn2<uint8_t>(op);
  } else if (elem_bytes == 2) {
    VSlidehn2<uint16_t>(op);
  } else if (elem_bytes == 4) {
    VSlidehn2<uint32_t>(op);
  } else {
    assert(false);
  }
}
// Runs VSlidehp2<T> with the element type matching op.sz (1, 2 or 4 bytes);
// any other size trips the assert.
static void VSlidehp2(valu_t& op) {
  const int elem_bytes = op.sz;
  if (elem_bytes == 1) {
    VSlidehp2<uint8_t>(op);
  } else if (elem_bytes == 2) {
    VSlidehp2<uint16_t>(op);
  } else if (elem_bytes == 4) {
    VSlidehp2<uint32_t>(op);
  } else {
    assert(false);
  }
}
// Element-wise select over elements of type T: bit 0 of each element of read
// port 0 chooses between read port 2 (bit set) and read port 1 (bit clear).
// The result goes to write port 0, which is marked valid.
template <typename T>
void VSel(valu_t& op) {
  constexpr int kElems = kLanes * 4 / sizeof(T);
  const T* mask = reinterpret_cast<const T*>(op.in[0].data);
  const T* when_clear = reinterpret_cast<const T*>(op.in[1].data);
  const T* when_set = reinterpret_cast<const T*>(op.in[2].data);
  T* dst = reinterpret_cast<T*>(op.out[0].data);
  for (int idx = 0; idx < kElems; ++idx) {
    if (mask[idx] & 1) {
      dst[idx] = when_set[idx];
    } else {
      dst[idx] = when_clear[idx];
    }
  }
  op.w[0].valid = true;
}
// Runs VSel<T> with the element type matching op.sz (1, 2 or 4 bytes); any
// other size trips the assert.
static void VSel(valu_t& op) {
  const int elem_bytes = op.sz;
  if (elem_bytes == 1) {
    VSel<uint8_t>(op);
  } else if (elem_bytes == 2) {
    VSel<uint16_t>(op);
  } else if (elem_bytes == 4) {
    VSel<uint32_t>(op);
  } else {
    assert(false);
  }
}
// Gathers the even-indexed elements of type T: the lower half of write port 0
// is filled from read port 0 and the remainder from read port 1. Write port 0
// is marked valid.
template <typename T>
void VEvn(valu_t& op) {
  constexpr int kElems = kLanes * 4 / sizeof(T);
  constexpr int kHalf = kElems / 2;
  const T* first = reinterpret_cast<const T*>(op.in[0].data);
  const T* second = reinterpret_cast<const T*>(op.in[1].data);
  T* dst = reinterpret_cast<T*>(op.out[0].data);
  int idx = 0;
  for (; idx < kHalf; ++idx) {
    dst[idx] = first[2 * idx + 0];
  }
  for (; idx < kElems; ++idx) {
    dst[idx] = second[2 * (idx - kHalf) + 0];
  }
  op.w[0].valid = true;
}
// Gathers the odd-indexed elements of type T: the lower half of write port 1
// is filled from read port 0 and the remainder from read port 1. Only write
// port 1 is written and marked valid.
template <typename T>
void VOdd(valu_t& op) {
  constexpr int kElems = kLanes * 4 / sizeof(T);
  constexpr int kHalf = kElems / 2;
  const T* first = reinterpret_cast<const T*>(op.in[0].data);
  const T* second = reinterpret_cast<const T*>(op.in[1].data);
  T* dst = reinterpret_cast<T*>(op.out[1].data);
  int idx = 0;
  for (; idx < kHalf; ++idx) {
    dst[idx] = first[2 * idx + 1];
  }
  for (; idx < kElems; ++idx) {
    dst[idx] = second[2 * (idx - kHalf) + 1];
  }
  op.w[1].valid = true;
}
// Combined even/odd gather for elements of type T: runs VEvn<T> (write port 0)
// followed by VOdd<T> (write port 1) on the same operation.
template <typename T>
void VEvnOdd(valu_t& op) {
VEvn<T>(op);
VOdd<T>(op);
}
// Runs VEvn<T> with the element type matching op.sz (1, 2 or 4 bytes); any
// other size trips the assert.
static void VEvn(valu_t& op) {
  const int elem_bytes = op.sz;
  if (elem_bytes == 1) {
    VEvn<uint8_t>(op);
  } else if (elem_bytes == 2) {
    VEvn<uint16_t>(op);
  } else if (elem_bytes == 4) {
    VEvn<uint32_t>(op);
  } else {
    assert(false);
  }
}
// Runs VOdd<T> with the element type matching op.sz (1, 2 or 4 bytes); any
// other size trips the assert.
static void VOdd(valu_t& op) {
  const int elem_bytes = op.sz;
  if (elem_bytes == 1) {
    VOdd<uint8_t>(op);
  } else if (elem_bytes == 2) {
    VOdd<uint16_t>(op);
  } else if (elem_bytes == 4) {
    VOdd<uint32_t>(op);
  } else {
    assert(false);
  }
}
// Runs VEvnOdd<T> with the element type matching op.sz (1, 2 or 4 bytes); any
// other size trips the assert.
static void VEvnOdd(valu_t& op) {
  const int elem_bytes = op.sz;
  if (elem_bytes == 1) {
    VEvnOdd<uint8_t>(op);
  } else if (elem_bytes == 2) {
    VEvnOdd<uint16_t>(op);
  } else if (elem_bytes == 4) {
    VEvnOdd<uint32_t>(op);
  } else {
    assert(false);
  }
}
// Interleaves the elements of read ports 0 and 1: even output positions take
// from port 0 and odd positions from port 1. Write port 0 zips the lower
// halves of the inputs and write port 1 the upper halves; both write ports
// are marked valid.
template <typename T>
void VZip(valu_t& op) {
  constexpr int kElems = kLanes * 4 / sizeof(T);
  constexpr int kHalf = kElems / 2;
  const T* even_src = reinterpret_cast<const T*>(op.in[0].data);
  const T* odd_src = reinterpret_cast<const T*>(op.in[1].data);
  T* lower = reinterpret_cast<T*>(op.out[0].data);
  T* upper = reinterpret_cast<T*>(op.out[1].data);
  for (int idx = 0; idx < kElems; ++idx) {
    const T* src = (idx & 1) ? odd_src : even_src;
    const int pos = idx / 2;
    lower[idx] = src[pos + 0];
    upper[idx] = src[pos + kHalf];
  }
  op.w[0].valid = true;
  op.w[1].valid = true;
}
// Runs VZip<T> with the element type matching op.sz (1, 2 or 4 bytes); any
// other size trips the assert.
static void VZip(valu_t& op) {
  const int elem_bytes = op.sz;
  if (elem_bytes == 1) {
    VZip<uint8_t>(op);
  } else if (elem_bytes == 2) {
    VZip<uint16_t>(op);
  } else if (elem_bytes == 4) {
    VZip<uint32_t>(op);
  } else {
    assert(false);
  }
}
// Depthwise-convolution kernel for one 32-bit word position.
//
// For each of the four byte positions i, accumulates over the three taps j
//   (a + abias) * (b + bbias)
// where a / b are byte i of adata[j] / bdata[j], and stores the 32-bit sum
// (wrapping modulo 2^32) in out[i].
//
//   adata, bdata  three input words each (only elements 0..2 are read).
//   abias, bbias  9-bit two's-complement biases; bits above bit 8 are ignored.
//   asign, bsign  when true the corresponding bytes are treated as signed
//                 8-bit values, otherwise as unsigned.
//   out           receives the four accumulated results.
static void VDwconv(const uint32_t adata[3], const uint32_t bdata[3],
                    const uint32_t abias, const uint32_t bbias,
                    const bool asign, const bool bsign, uint32_t out[4]) {
  // Portable sign extension of the low 9 bits: flipping the sign bit and
  // subtracting its weight avoids implementation-defined shifts/conversions.
  const int32_t s_abias = (int32_t(abias & 0x1ff) ^ 0x100) - 0x100;
  const int32_t s_bbias = (int32_t(bbias & 0x1ff) ^ 0x100) - 0x100;
  for (int i = 0; i < 4; ++i) {
    uint32_t accum = 0;
    for (int j = 0; j < 3; ++j) {
      const uint8_t a_byte = uint8_t(adata[j] >> (8 * i));
      const uint8_t b_byte = uint8_t(bdata[j] >> (8 * i));
      const int32_t s_adata = asign ? int32_t(int8_t(a_byte)) : int32_t(a_byte);
      const int32_t s_bdata = bsign ? int32_t(int8_t(b_byte)) : int32_t(b_byte);
      // Each product fits easily in int32_t; the accumulation wraps as
      // unsigned 32-bit arithmetic.
      accum += uint32_t((s_adata + s_abias) * (s_bdata + s_bbias));
    }
    out[i] = accum;
  }
}
// Depthwise-convolution reference for a whole operation.
//
// Decodes the command word in op.sv.data, builds the three "a" taps for every
// word position from read ports 0..2 (directly, or through a sliding window
// when a sparsity mode is selected), takes the three "b" taps from read ports
// 3..5 and writes the four per-byte accumulators to write ports 0..3, all of
// which are marked valid.
static void VDwconv(valu_t& op) {
  const uint32_t* in0 = op.in[0].data;
  const uint32_t* in1 = op.in[1].data;
  const uint32_t* in2 = op.in[2].data;
  const uint32_t* in3 = op.in[3].data;
  const uint32_t* in4 = op.in[4].data;
  const uint32_t* in5 = op.in[5].data;
  uint32_t* out0 = op.out[0].data;
  uint32_t* out1 = op.out[1].data;
  uint32_t* out2 = op.out[2].data;
  uint32_t* out3 = op.out[3].data;

  // Command word layout.
  // NOTE(review): the bit ranges in the trailing comments come from the
  // original source. Bit-field allocation order is implementation-defined;
  // ABIs that allocate starting at the least significant bit (e.g. System V
  // x86-64) place `mode` in bits 1:0 instead. Confirm against the hardware
  // command encoding.
  struct vdwconv_u8_t {
    uint32_t mode : 2;      // 31:30
    uint32_t sparsity : 2;  // 29:28
    uint32_t regbase : 4;   // 27:24
    uint32_t rsvd : 4;      // 23:20
    uint32_t abias : 9;     // 19:11
    uint32_t asign : 1;     // 10
    uint32_t bbias : 9;     // 9:1
    uint32_t bsign : 1;     // 0
  } cmd;
  uint32_t* p_cmd = reinterpret_cast<uint32_t*>(&cmd);
  *p_cmd = op.sv.data;
  assert(cmd.mode == 0);
  assert(cmd.rsvd == 0);
  assert(cmd.sparsity < 3);
  const uint32_t abias = cmd.abias;
  const uint32_t bbias = cmd.bbias;
  const bool asign = cmd.asign;
  const bool bsign = cmd.bsign;

  constexpr int n = kVector / 32;
  constexpr int kSparseSize = n + 2;
  // Sliding window of n + 2 words used by the sparse modes. Zero-initialised
  // so that no indeterminate value is ever read, even if the sparsity assert
  // above is compiled out.
  uint32_t sparse[kSparseSize] = {};
  if (cmd.sparsity == 1) {
    // Window = last word of port 0, all of port 1, first word of port 2.
    sparse[0] = in0[n - 1];
    for (int i = 0; i < n; ++i) {
      sparse[i + 1] = in1[i];
    }
    sparse[n + 1] = in2[0];
  } else if (cmd.sparsity == 2) {
    // Window = all of port 0 followed by the first two words of port 1.
    for (int i = 0; i < n; ++i) {
      sparse[i] = in0[i];
    }
    sparse[n + 0] = in1[0];
    sparse[n + 1] = in1[1];
  }

  const bool dense = !cmd.sparsity;
  for (int i = 0; i < n; ++i) {
    // Dense mode takes one word per port; sparse modes take three
    // consecutive words of the window.
    const uint32_t adata[3] = {dense ? in0[i] : sparse[i + 0],
                               dense ? in1[i] : sparse[i + 1],
                               dense ? in2[i] : sparse[i + 2]};
    const uint32_t bdata[3] = {in3[i], in4[i], in5[i]};
    uint32_t out[4];
    VDwconv(adata, bdata, abias, bbias, asign, bsign, out);
    // Note the output interleaving.
    out0[i] = out[0];
    out2[i] = out[1];
    out1[i] = out[2];
    out3[i] = out[3];
  }
  op.w[0].valid = true;
  op.w[1].valid = true;
  op.w[2].valid = true;
  op.w[3].valid = true;
}
// Reference-model entry point: computes the expected results of `op` in
// place. Opcodes that operate on whole registers are forwarded to their
// dedicated helpers and return immediately; every other opcode is evaluated
// one 32-bit lane at a time through the VOP*/WOP* macros, which read the
// locals declared in the loop body (sz, a, b, c, d, f, f2_signed) and write
// v, w, x and y.
static void VAlu(valu_t& op) {
// clang-format off
// Whole-register operations. vslidehn/vslidehp reuse the vslidevn/vslidevp
// implementations.
switch (op.op) {
case encode::vslidevn: VSlidevn(op); return;
case encode::vslidevp: VSlidevp(op); return;
case encode::vslidehn: VSlidevn(op); return;
case encode::vslidehp: VSlidevp(op); return;
case encode::vslidehn2: VSlidehn2(op); return;
case encode::vslidehp2: VSlidehp2(op); return;
case encode::vsel: VSel(op); return;
case encode::vevn: VEvn(op); return;
case encode::vodd: VOdd(op); return;
case encode::vevnodd: VEvnOdd(op); return;
case encode::vzip: VZip(op); return;
case encode::vdwconv: VDwconv(op); return;
}
// clang-format on
// Lane-wise operations: one 32-bit word of every operand per iteration.
for (int i = 0; i < kLanes; ++i) {
const uint8_t f2 = op.f2;
const uint8_t sz = op.sz;
// Operand words of this lane; read ports 0, 1, 2, 3 and 5 are used.
const uint32_t a = op.in[0].data[i];
const uint32_t b = op.in[1].data[i];
const uint32_t c = op.in[2].data[i];
const uint32_t d = op.in[3].data[i];
const uint32_t f = op.in[5].data[i];
// v / w: valid flags for write ports 0 / 1; x / y: their result words.
// They keep these defaults when the opcode or size matches no case below.
bool v = false;
bool w = false;
uint32_t x = 0;
uint32_t y = 0;
// For vdmulh/vdmulh2, bit 0 of f2 is the "negative" flag and the operation
// is always signed; for every other opcode bit 0 set selects unsigned.
const bool f2_negative = ((f2 >> 0) & 1) && (op.op == encode::vdmulh ||
op.op == encode::vdmulh2);
// Bit 1 of f2 is passed as the rounding argument to the *_R macros.
const bool f2_round = (f2 >> 1) & 1;
const bool f2_signed =
!((f2 >> 0) & 1) || op.op == encode::vdmulh || op.op == encode::vdmulh2;
// clang-format off
// Each macro expands to an if-chain on sz that fills v/w/x/y.
switch (op.op) {
case encode::vdup: VOPXU(dup); break;
case encode::vadd: VOP2U(add); break;
case encode::vsub: VOP2U(sub); break;
case encode::vrsub: VOP2U(rsub); break;
case encode::veq: VOP2U(cmp_eq); break;
case encode::vne: VOP2U(cmp_ne); break;
case encode::vlt: VOP2(cmp_lt); break;
case encode::vle: VOP2(cmp_le); break;
case encode::vgt: VOP2(cmp_gt); break;
case encode::vge: VOP2(cmp_ge); break;
case encode::vabsd: VOP2(absd); break;
case encode::vmax: VOP2(max); break;
case encode::vmin: VOP2(min); break;
case encode::vadd3: VOP3U(add3); break;
case encode::vand: VOP2U(log_and); break;
case encode::vor: VOP2U(log_or); break;
case encode::vxor: VOP2U(log_xor); break;
case encode::vnot: VOP1U(log_not); break;
case encode::vrev: VOP2U(log_rev); break;
case encode::vror: VOP2U(log_ror); break;
case encode::vclb: VOP1U(log_clb); break;
case encode::vclz: VOP1U(log_clz); break;
case encode::vcpop: VOP1U(log_cpop); break;
case encode::vmv: VOP1U(mv); break;
case encode::vmv2: VOP1PU(mv); break;
case encode::vmvp: VOP2M(mvp); break;
case encode::vshl: VOP2U(shl); break;
case encode::vshr: VOP2(shr); break;
case encode::vshf: VOP2_R(shf, f2_round); break;
case encode::vsrans: VOP3NS_R_U(srans, f2_round, !f2_signed); break;
case encode::vsraqs: VOP3QS_R_U(srans, f2_round, !f2_signed); break;
case encode::vmul: VOP2S(mul); break;
case encode::vmul2: VOP2PS(mul); break;
case encode::vmuls: VOP2(muls); break;
case encode::vmuls2: VOP2P(muls); break;
case encode::vmulw: WOP2(mulw); break;
case encode::vmulh: VOP2_R(mulh, f2_round); break;
case encode::vmulh2: VOP2P_R(mulh, f2_round); break;
case encode::vdmulh: VOP2_R_X(dmulh, f2_round, f2_negative); break;
case encode::vdmulh2: VOP2P_R_X(dmulh, f2_round, f2_negative); break;
case encode::vmadd: VOP3(madd); break;
case encode::vadds: VOP2(adds); break;
case encode::vsubs: VOP2(subs); break;
case encode::vaddw: WOP2(addw); break;
case encode::vsubw: WOP2(subw); break;
case encode::vacc: WOPA(acc); break;
case encode::vpadd: VOPP(padd); break;
case encode::vpsub: VOPP(psub); break;
case encode::vhadd: VOP2_R(hadd, f2_round); break;
case encode::vhsub: VOP2_R(hsub, f2_round); break;
}
// clang-format on
// Publish this lane's results. The valid flags are rewritten on every
// iteration; they depend only on op.op and op.sz, so every lane writes the
// same value.
op.w[0].valid = v;
op.w[1].valid = w;
op.out[0].data[i] = x;
op.out[1].data[i] = y;
}
}
#endif // TESTS_VERILATOR_SIM_KELVIN_VALU_H_