| // Copyright lowRISC contributors. |
| // Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md. |
| // Licensed under the Apache License, Version 2.0, see LICENSE for details. |
| // SPDX-License-Identifier: Apache-2.0 |
| |
| /** |
| * Arithmetic logic unit |
| */ |
| module ibex_alu #( |
| parameter ibex_pkg::rv32b_e RV32B = ibex_pkg::RV32BNone |
| ) ( |
| input ibex_pkg::alu_op_e operator_i, |
| input logic [31:0] operand_a_i, |
| input logic [31:0] operand_b_i, |
| |
| input logic instr_first_cycle_i, |
| |
| input logic [32:0] multdiv_operand_a_i, |
| input logic [32:0] multdiv_operand_b_i, |
| |
| input logic multdiv_sel_i, |
| |
| input logic [31:0] imd_val_q_i[2], |
| output logic [31:0] imd_val_d_o[2], |
| output logic [1:0] imd_val_we_o, |
| |
| output logic [31:0] adder_result_o, |
| output logic [33:0] adder_result_ext_o, |
| |
| output logic [31:0] result_o, |
| output logic comparison_result_o, |
| output logic is_equal_result_o |
| ); |
| import ibex_pkg::*; |
| |
| logic [31:0] operand_a_rev; |
| logic [32:0] operand_b_neg; |
| |
| // Bit-reversed copy of operand_a_i, used for left shifts and bit counting. |
| for (genvar bit_idx = 0; bit_idx < 32; bit_idx++) begin : gen_rev_operand_a |
|   assign operand_a_rev[31-bit_idx] = operand_a_i[bit_idx]; |
| end |
| |
| /////////// |
| // Adder // |
| /////////// |
| |
| logic adder_op_a_shift1, adder_op_a_shift2, adder_op_a_shift3; |
| logic adder_op_b_negate; |
| logic [32:0] adder_in_a, adder_in_b; |
| logic [31:0] adder_result; |
| |
| // Decode the adder controls from the ALU operator. Every control defaults to inactive and |
| // is raised only for the operators listed below. |
| always_comb begin |
|   {adder_op_a_shift1, adder_op_a_shift2, adder_op_a_shift3, adder_op_b_negate} = 4'b0000; |
| |
|   unique case (operator_i) |
|     // Address Calculation OPs (RV32B Ops): pre-shift operand A by 1, 2 or 3 bits. |
|     ALU_SH1ADD: adder_op_a_shift1 = (RV32B != RV32BNone); |
|     ALU_SH2ADD: adder_op_a_shift2 = (RV32B != RV32BNone); |
|     ALU_SH3ADD: adder_op_a_shift3 = (RV32B != RV32BNone); |
| |
|     // Operand B is negated for subtraction, all comparator OPs and MinMax OPs (RV32B Ops). |
|     ALU_SUB, |
|     ALU_EQ, ALU_NE, |
|     ALU_GE, ALU_GEU, |
|     ALU_LT, ALU_LTU, |
|     ALU_SLT, ALU_SLTU, |
|     ALU_MIN, ALU_MINU, |
|     ALU_MAX, ALU_MAXU: adder_op_b_negate = 1'b1; |
| |
|     default: ; |
|   endcase |
| end |
| |
| // Select the 33-bit adder operand A. Except for the multdiv path, the operand (optionally |
| // pre-shifted left by 1, 2 or 3) is extended with an LSB of 1. |
| always_comb begin |
|   unique case (1'b1) |
|     multdiv_sel_i:     adder_in_a = multdiv_operand_a_i; |
|     adder_op_a_shift1: adder_in_a = {operand_a_i[30:0], 1'b0, 1'b1}; |
|     adder_op_a_shift2: adder_in_a = {operand_a_i[29:0], 2'b00, 1'b1}; |
|     adder_op_a_shift3: adder_in_a = {operand_a_i[28:0], 3'b000, 1'b1}; |
|     default:           adder_in_a = {operand_a_i, 1'b1}; |
|   endcase |
| end |
| |
| // Select the 33-bit adder operand B. operand_b_neg is operand B extended with an LSB of 0 and |
| // then inverted bit by bit, so its LSB is 1. |
| assign operand_b_neg = ~{operand_b_i, 1'b0}; |
| always_comb begin |
|   unique case (1'b1) |
|     multdiv_sel_i:     adder_in_b = multdiv_operand_b_i; |
|     adder_op_b_negate: adder_in_b = operand_b_neg; |
|     default:           adder_in_b = {operand_b_i, 1'b0}; |
|   endcase |
| end |
| |
| // Actual adder: 34-bit sum of the two zero-extended 33-bit operands. |
| assign adder_result_ext_o = {1'b0, adder_in_a} + {1'b0, adder_in_b}; |
| |
| // The 32-bit result sits in bits [32:1]; bit 0 belongs to the appended operand LSBs. |
| assign adder_result = adder_result_ext_o[32:1]; |
| assign adder_result_o = adder_result; |
| |
| //////////////// |
| // Comparison // |
| //////////////// |
| |
| logic is_equal; |
| logic is_greater_equal; // handles both signed and unsigned forms |
| logic cmp_signed; |
| |
| // cmp_signed is raised for the operators whose comparison is signed. |
| always_comb begin |
|   cmp_signed = 1'b0; |
|   unique case (operator_i) |
|     ALU_GE, ALU_LT, ALU_SLT, |
|     ALU_MIN, ALU_MAX: cmp_signed = 1'b1; // MIN / MAX: RV32B only |
|     default: ; |
|   endcase |
| end |
| |
| // Operands are equal when the adder result is all zeros. |
| assign is_equal = (adder_result == '0); |
| assign is_equal_result_o = is_equal; |
| |
| // Is greater equal |
| always_comb begin |
|   if (operand_a_i[31] == operand_b_i[31]) begin |
|     // Same MSB: decided by the MSB of the adder result. |
|     is_greater_equal = ~adder_result[31]; |
|   end else begin |
|     // Different MSBs: decided by operand A's MSB, interpreted per cmp_signed. |
|     is_greater_equal = operand_a_i[31] ^ cmp_signed; |
|   end |
| end |
| |
| // GTE unsigned: |
| // (a[31] == 1 && b[31] == 1) => adder_result[31] == 0 |
| // (a[31] == 0 && b[31] == 0) => adder_result[31] == 0 |
| // (a[31] == 1 && b[31] == 0) => 1 |
| // (a[31] == 0 && b[31] == 1) => 0 |
| |
| // GTE signed: |
| // (a[31] == 1 && b[31] == 1) => adder_result[31] == 0 |
| // (a[31] == 0 && b[31] == 0) => adder_result[31] == 0 |
| // (a[31] == 1 && b[31] == 0) => 0 |
| // (a[31] == 0 && b[31] == 1) => 1 |
| |
| // Generate the comparison result from is_equal / is_greater_equal. |
| logic cmp_result; |
| |
| always_comb begin |
|   // ALU_EQ and every operator not listed below report is_equal. |
|   cmp_result = is_equal; |
| |
|   unique case (operator_i) |
|     ALU_NE:             cmp_result = ~is_equal; |
|     ALU_GE, ALU_GEU, |
|     ALU_MAX, ALU_MAXU:  cmp_result = is_greater_equal; // MAX / MAXU: RV32B only |
|     ALU_LT, ALU_LTU, |
|     ALU_SLT, ALU_SLTU, |
|     ALU_MIN, ALU_MINU:  cmp_result = ~is_greater_equal; // MIN / MINU: RV32B only |
|     default: ; |
|   endcase |
| end |
| |
| assign comparison_result_o = cmp_result; |
| |
| /////////// |
| // Shift // |
| /////////// |
| |
| // The shifter structure consists of a 33-bit shifter: 32-bit operand + 1 bit extension for |
| // arithmetic shifts and one-shift support. |
| // Rotations and funnel shifts are implemented as multi-cycle instructions. |
| // The shifter is also used for single-bit instructions and bit-field place as detailed below. |
| // |
| // Standard Shifts |
| // =============== |
| // For standard shift instructions, the direction of the shift is to the right by default. For |
| // left shifts, the signal shift_left is set. If so, the operand is initially reversed, |
| // shifted to the right by the specified amount and reversed back again. For arithmetic and |
| // one-shifts, the 33rd bit of the shifter operand is set accordingly. |
| // |
| // Multicycle Shifts |
| // ================= |
| // |
| // Rotation |
| // -------- |
| // For rotations, the operand signals operand_a_i and operand_b_i are kept constant to rs1 and |
| // rs2 respectively. |
| // |
| // Rotation pseudocode: |
| // shift_amt = rs2 & 31; |
| // multicycle_result = (rs1 >> shift_amt) | (rs1 << (32 - shift_amt)); |
| // ^-- cycle 0 -----^ ^-- cycle 1 --------------^ |
| // |
| // Funnel Shifts |
| // ------------- |
| // For funnel shifts, operand_a_i is tied to rs1 in the first cycle and rs3 in the |
| // second cycle. operand_b_i is always tied to rs2. The order of applying the shift amount or |
| // its complement is determined by bit [5] of shift_amt. |
| // |
| // Funnel shift Pseudocode: (fsl) |
| // shift_amt = rs2 & 63; |
| // shift_amt_compl = 32 - shift_amt[4:0] |
| // if (shift_amt >=33): |
| // multicycle_result = (rs1 >> shift_amt_compl[4:0]) | (rs3 << shift_amt[4:0]); |
| // ^-- cycle 0 ----------------^ ^-- cycle 1 ------------^ |
| // else if (shift_amt <= 31 && shift_amt > 0): |
| // multicycle_result = (rs1 << shift_amt[4:0]) | (rs3 >> shift_amt_compl[4:0]); |
| // ^-- cycle 0 ----------^ ^-- cycle 1 -------------------^ |
| // For shift_amt == 0, 32, both shift_amt[4:0] and shift_amt_compl[4:0] == '0. |
| // these cases need to be handled separately outside the shifting structure: |
| // else if (shift_amt == 32): |
| // multicycle_result = rs3 |
| // else if (shift_amt == 0): |
| // multicycle_result = rs1. |
| // |
| // Single-Bit Instructions |
| // ======================= |
| // Single bit instructions operate on bit operand_b_i[4:0] of operand_a_i. |
| |
| // The operations bset, bclr and binv are implemented by generation of a bit-mask using the |
| // shifter structure. This is done by left-shifting the operand 32'h1 by the required amount. |
| // The signal shift_sbmode multiplexes the shifter input and sets the signal shift_left. |
| // Further processing is taken care of by a separate structure. |
| // |
| // For bext, the bit defined by operand_b_i[4:0] is to be returned. This is done by simply |
| // shifting operand_a_i to the right by the required amount and returning bit [0] of the result. |
| // |
| // Bit-Field Place |
| // =============== |
| // The shifter structure is shared to compute bfp_mask << bfp_off. |
| |
| logic shift_left; |
| logic shift_ones; |
| logic shift_arith; |
| logic shift_funnel; |
| logic shift_sbmode; |
| logic [5:0] shift_amt; |
| logic [5:0] shift_amt_compl; // complementary shift amount (32 - shift_amt) |
| |
| logic [31:0] shift_operand; |
| logic signed [32:0] shift_result_ext_signed; |
| logic [32:0] shift_result_ext; |
| logic unused_shift_result_ext; |
| logic [31:0] shift_result; |
| logic [31:0] shift_result_rev; |
| |
| // zbf |
| logic bfp_op; |
| logic [4:0] bfp_len; |
| logic [4:0] bfp_off; |
| logic [31:0] bfp_mask; |
| logic [31:0] bfp_mask_rev; |
| logic [31:0] bfp_result; |
| |
| // bfp: shares the shifter structure to compute bfp_mask << bfp_off. |
| // Control fields come from operand_b_i: [27:24] is the length (len = 0 encodes for len = 16) |
| // and [20:16] is the offset. |
| assign bfp_op = (RV32B != RV32BNone) && (operator_i == ALU_BFP); |
| assign bfp_len = {(~|operand_b_i[27:24]), operand_b_i[27:24]}; |
| assign bfp_off = operand_b_i[20:16]; |
| |
| // Mask with the bfp_len least significant bits set. |
| assign bfp_mask = (RV32B != RV32BNone) ? ~({32{1'b1}} << bfp_len) : '0; |
| |
| // Bit-reversed mask, fed to the shifter for bfp. |
| for (genvar m = 0; m < 32; m++) begin : gen_rev_bfp_mask |
|   assign bfp_mask_rev[31-m] = bfp_mask[m]; |
| end |
| |
| // Clear the field selected by shift_result in operand A and insert the masked, shifted data. |
| assign bfp_result = (RV32B != RV32BNone) ? |
|     (((operand_b_i & bfp_mask) << bfp_off) | (operand_a_i & ~shift_result)) : '0; |
| |
| // shift_amt[5] is the word-swap bit; it is only considered for FSL/FSR (shift_funnel). |
| // If set, the shift amount and its complement swap roles between the first and second cycle. |
| assign shift_amt[5] = operand_b_i[5] & shift_funnel; |
| assign shift_amt_compl = 32 - operand_b_i[4:0]; |
| |
| always_comb begin |
| if (bfp_op) begin |
| shift_amt[4:0] = bfp_off; // offset field of the bfp control word (operand_b_i[20:16]) |
| end else begin |
| shift_amt[4:0] = instr_first_cycle_i ? |
| (operand_b_i[5] && shift_funnel ? shift_amt_compl[4:0] : operand_b_i[4:0]) : |
| (operand_b_i[5] && shift_funnel ? operand_b_i[4:0] : shift_amt_compl[4:0]); |
| end |
| end |
| |
| // single-bit mode: set for bset, bclr and binv (RV32B only) |
| assign shift_sbmode = (RV32B != RV32BNone) && |
|     ((operator_i == ALU_BSET) || (operator_i == ALU_BCLR) || (operator_i == ALU_BINV)); |
| |
| // left shift if this is: |
| // * a standard left shift (slo, sll) |
| // * a rol in the first cycle |
| // * a ror in the second cycle |
| // * fsl: without word-swap bit: first cycle, else: second cycle |
| // * fsr: without word-swap bit: second cycle, else: first cycle |
| // * a single-bit instruction: bclr, bset, binv (excluding bext) |
| // * bfp: bfp_mask << bfp_off |
| // |
| // All ternary alternatives are sized 1'b0. An unsized 0 would widen the whole expression |
| // (including the ~ operand) to 32 bits before it is truncated to the 1-bit target. |
| always_comb begin |
|   unique case (operator_i) |
|     ALU_SLL: shift_left = 1'b1; |
|     ALU_SLO: shift_left = (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) ? 1'b1 : 1'b0; |
|     ALU_BFP: shift_left = (RV32B != RV32BNone) ? 1'b1 : 1'b0; |
|     ALU_ROL: shift_left = (RV32B != RV32BNone) ? instr_first_cycle_i : 1'b0; |
|     ALU_ROR: shift_left = (RV32B != RV32BNone) ? ~instr_first_cycle_i : 1'b0; |
|     ALU_FSL: shift_left = (RV32B != RV32BNone) ? |
|         (shift_amt[5] ? ~instr_first_cycle_i : instr_first_cycle_i) : 1'b0; |
|     ALU_FSR: shift_left = (RV32B != RV32BNone) ? |
|         (shift_amt[5] ? instr_first_cycle_i : ~instr_first_cycle_i) : 1'b0; |
|     default: shift_left = 1'b0; |
|   endcase |
|   // Single-bit mode always shifts left, regardless of the decode above. |
|   if (shift_sbmode) begin |
|     shift_left = 1'b1; |
|   end |
| end |
| |
| // Arithmetic right shift. |
| assign shift_arith = (operator_i == ALU_SRA); |
| // One-shifts (slo / sro) exist only in the OTEarlGrey and Full configurations. |
| assign shift_ones = (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) && |
|     ((operator_i == ALU_SLO) || (operator_i == ALU_SRO)); |
| // Funnel shifts (fsl / fsr) exist whenever RV32B is enabled. |
| assign shift_funnel = (RV32B != RV32BNone) && |
|     ((operator_i == ALU_FSL) || (operator_i == ALU_FSR)); |
| |
| // Shifter structure: one 33-bit arithmetic right shift serves all shift flavours. |
| always_comb begin |
|   // Shifter input selection: for bfp, sbmode and shift_left the corresponding |
|   // bit-reversed input is chosen. |
|   if (RV32B == RV32BNone) begin |
|     shift_operand = shift_left ? operand_a_rev : operand_a_i; |
|   end else begin |
|     unique case (1'b1) |
|       bfp_op:       shift_operand = bfp_mask_rev; |
|       shift_sbmode: shift_operand = 32'h8000_0000; // bit-reversed 32'h1 |
|       default:      shift_operand = shift_left ? operand_a_rev : operand_a_i; |
|     endcase |
|   end |
| |
|   // Bit 32 is the fill value: 1 for one-shifts, the operand MSB for arithmetic shifts. |
|   shift_result_ext_signed = |
|       $signed({shift_ones | (shift_arith & shift_operand[31]), shift_operand}) >>> shift_amt[4:0]; |
|   shift_result_ext = $unsigned(shift_result_ext_signed); |
|   unused_shift_result_ext = shift_result_ext[32]; |
| |
|   // Bit-reversed view of the lower 32 bits of the shifted value. |
|   for (int unsigned b = 0; b < 32; b++) begin |
|     shift_result_rev[31-b] = shift_result_ext[b]; |
|   end |
| |
|   // Left shifts return the reversed value, all others the plain one. |
|   shift_result = shift_left ? shift_result_rev : shift_result_ext[31:0]; |
| end |
| |
| /////////////////// |
| // Bitwise Logic // |
| /////////////////// |
| |
| logic bwlogic_or, bwlogic_and; |
| logic [31:0] bwlogic_operand_b; |
| logic [31:0] bwlogic_or_result; |
| logic [31:0] bwlogic_and_result; |
| logic [31:0] bwlogic_xor_result; |
| logic [31:0] bwlogic_result; |
| |
| logic bwlogic_op_b_negate; |
| |
| // Operand B is inverted for the logic-with-negate OPs and, for cmix, in every cycle but the |
| // first. All of these are RV32B Ops. |
| always_comb begin |
|   bwlogic_op_b_negate = 1'b0; |
|   unique case (operator_i) |
|     ALU_XNOR, ALU_ORN, ALU_ANDN: bwlogic_op_b_negate = (RV32B != RV32BNone); |
|     ALU_CMIX: bwlogic_op_b_negate = (RV32B != RV32BNone) & ~instr_first_cycle_i; |
|     default: ; |
|   endcase |
| end |
| |
| // operand_b_neg[32:1] is the bitwise inverse of operand_b_i. |
| assign bwlogic_operand_b = bwlogic_op_b_negate ? operand_b_neg[32:1] : operand_b_i; |
| |
| // All three candidate results are computed in parallel. |
| assign bwlogic_or_result  = bwlogic_operand_b | operand_a_i; |
| assign bwlogic_and_result = bwlogic_operand_b & operand_a_i; |
| assign bwlogic_xor_result = bwlogic_operand_b ^ operand_a_i; |
| |
| assign bwlogic_or  = (operator_i == ALU_OR)  || (operator_i == ALU_ORN); |
| assign bwlogic_and = (operator_i == ALU_AND) || (operator_i == ALU_ANDN); |
| |
| // Anything that is neither an OR nor an AND flavour selects the XOR result. |
| always_comb begin |
|   unique case (1'b1) |
|     bwlogic_or:  bwlogic_result = bwlogic_or_result; |
|     bwlogic_and: bwlogic_result = bwlogic_and_result; |
|     default:     bwlogic_result = bwlogic_xor_result; |
|   endcase |
| end |
| |
| logic [5:0] bitcnt_result; |
| logic [31:0] minmax_result; |
| logic [31:0] pack_result; |
| logic [31:0] sext_result; |
| logic [31:0] singlebit_result; |
| logic [31:0] rev_result; |
| logic [31:0] shuffle_result; |
| logic [31:0] xperm_result; |
| logic [31:0] butterfly_result; |
| logic [31:0] invbutterfly_result; |
| logic [31:0] clmul_result; |
| logic [31:0] multicycle_result; |
| |
| if (RV32B != RV32BNone) begin : g_alu_rvb |
| |
| ///////////////// |
| // Bitcounting // |
| ///////////////// |
| |
| // The bit-counter structure computes the number of set bits in its operand. Partial results |
| // (from left to right) are needed to compute the control masks for computation of |
| // bcompress/bdecompress by the butterfly network, if implemented. |
| // For cpop, clz and ctz, only the end result is used. |
| |
| logic zbe_op; |
| logic bitcnt_ctz; |
| logic bitcnt_clz; |
| logic bitcnt_cz; |
| logic [31:0] bitcnt_bits; |
| logic [31:0] bitcnt_mask_op; |
| logic [31:0] bitcnt_bit_mask; |
| logic [ 5:0] bitcnt_partial [32]; |
| logic [31:0] bitcnt_partial_lsb_d; |
| logic [31:0] bitcnt_partial_msb_d; |
| |
| // Operator decode for the count-zeros instructions. |
| assign bitcnt_clz = (operator_i == ALU_CLZ); |
| assign bitcnt_ctz = (operator_i == ALU_CTZ); |
| assign bitcnt_cz  = bitcnt_clz | bitcnt_ctz; |
| |
| // The final count is the partial count of the top bit line. |
| assign bitcnt_result = bitcnt_partial[31]; |
| |
| // Bit-mask generation for clz and ctz: |
| // The bit mask is generated by spreading the lowest-order set bit in the operand to all |
| // higher order bits. The resulting mask is inverted to cover the lowest order zeros. In order |
| // to create the bit mask for leading zeros, the input operand needs to be reversed. |
| assign bitcnt_mask_op = bitcnt_clz ? operand_a_rev : operand_a_i; |
| |
| always_comb begin |
|   bitcnt_bit_mask = bitcnt_mask_op; |
|   // Spread by 1, 2, 4, 8 and 16 bit positions. |
|   for (int unsigned lvl = 0; lvl < 5; lvl++) begin |
|     bitcnt_bit_mask = bitcnt_bit_mask | (bitcnt_bit_mask << (1 << lvl)); |
|   end |
|   bitcnt_bit_mask = ~bitcnt_bit_mask; |
| end |
| |
| // bcompress / bdecompress count the bits of operand B. |
| assign zbe_op = (operator_i == ALU_BCOMPRESS) || (operator_i == ALU_BDECOMPRESS); |
| |
| // Select the bit vector handed to the counter. |
| always_comb begin |
|   unique case (1'b1) |
|     zbe_op:    bitcnt_bits = operand_b_i; |
|     bitcnt_cz: bitcnt_bits = ~bitcnt_mask_op & bitcnt_bit_mask; // clz / ctz |
|     default:   bitcnt_bits = operand_a_i; // cpop |
|   endcase |
| end |
| |
| // The parallel prefix counter is of the structure of a Brent-Kung Adder. In the first |
| // log2(width) stages, the sum of the n preceding bit lines is computed for the bit lines at |
| // positions 2**n-1 (power-of-two positions) where n denotes the current stage. |
| // In stage n=log2(width), the count for position width-1 (the MSB) is finished. |
| // For the intermediate values, an inverse adder tree then computes the bit counts for the bit |
| // lines at positions |
| // m = 2**(n-1) + i*2**(n-2), where i = [1 ... width / 2**(n-1)-1] and n = [log2(width) ... 2]. |
| // Thus, at every subsequent stage the result of two previously unconnected sub-trees is |
| // summed, starting at the node summing bits [width/2-1 : 0] and [3*width/4-1: width/2] |
| // and moving to iteratively sum up all the sub-trees. |
| // The inverse adder tree thus features log2(width) - 1 stages. The first of these stages is a |
| // single addition at position 3*width/4 - 1. It does not interfere with the last |
| // stage of the primary adder tree. These stages can thus be folded together, resulting in a |
| // total of 2*log2(width)-2 stages. |
| // For more details refer to R. Brent, H. T. Kung, "A Regular Layout for Parallel Adders", |
| // (1982). |
| // For a bitline at position p, only bits |
| // bitcnt_partial[max(i, such that p % log2(i) == 0)-1 : 0] are needed for generation of the |
| // butterfly network control signals. The adders in the intermediate value adder tree thus need |
| // not be full 5-bit adders. We leave the optimization to the synthesis tools. |
| // |
| // Consider the following 8-bit example for illustration. |
| // |
| // let bitcnt_bits = 8'babcdefgh. |
| // |
| // a b c d e f g h |
| // | /: | /: | /: | /: |
| // |/ : |/ : |/ : |/ : |
| // stage 1: + : + : + : + : |
| // | : /: : | : /: : |
| // |,--+ : : |,--+ : : |
| // stage 2: + : : : + : : : |
| // | : | : /: : : : |
| // |,-----,--+ : : : : ^-primary adder tree |
| // stage 3: + : + : : : : : ------------------------- |
| // : | /| /| /| /| /| : ,-intermediate adder tree |
| // : |/ |/ |/ |/ |/ : : |
| // stage 4 : + + + + + : : |
| // : : : : : : : : |
| // bitcnt_partial[i] 7 6 5 4 3 2 1 0 |
| |
| always_comb begin |
| bitcnt_partial = '{default: '0}; |
| // stage 1: every odd position i holds the count of bitcnt_bits[i:i-1] |
| for (int unsigned i = 1; i < 32; i += 2) begin |
| bitcnt_partial[i] = {5'h0, bitcnt_bits[i]} + {5'h0, bitcnt_bits[i-1]}; |
| end |
| // stage 2: positions 3, 7, ..., 31 hold the count of bitcnt_bits[i:i-3] |
| for (int unsigned i = 3; i < 32; i += 4) begin |
| bitcnt_partial[i] = bitcnt_partial[i-2] + bitcnt_partial[i]; |
| end |
| // stage 3: positions 7, 15, 23, 31 hold the count of bitcnt_bits[i:i-7] |
| for (int unsigned i = 7; i < 32; i += 8) begin |
| bitcnt_partial[i] = bitcnt_partial[i-4] + bitcnt_partial[i]; |
| end |
| // stage 4: positions 15 and 31 hold the count of bitcnt_bits[i:i-15] |
| for (int unsigned i = 15; i < 32; i += 16) begin |
| bitcnt_partial[i] = bitcnt_partial[i-8] + bitcnt_partial[i]; |
| end |
| // stage 5: position 31 gets the total count; position 23 gets the count of bitcnt_bits[23:0] |
| bitcnt_partial[31] = bitcnt_partial[15] + bitcnt_partial[31]; |
| // ^- primary adder tree |
| // ------------------------------- |
| // ,-intermediate value adder tree |
| bitcnt_partial[23] = bitcnt_partial[15] + bitcnt_partial[23]; |
| |
| // stage 6: positions 11, 19, 27 become the count of bitcnt_bits[i:0] |
| for (int unsigned i = 11; i < 32; i += 8) begin |
| bitcnt_partial[i] = bitcnt_partial[i-4] + bitcnt_partial[i]; |
| end |
| |
| // stage 7: positions 5, 9, ..., 29 become the count of bitcnt_bits[i:0] |
| for (int unsigned i = 5; i < 32; i += 4) begin |
| bitcnt_partial[i] = bitcnt_partial[i-2] + bitcnt_partial[i]; |
| end |
| // stage 8: even positions: count of the odd position below plus the position's own bit |
| bitcnt_partial[0] = {5'h0, bitcnt_bits[0]}; |
| for (int unsigned i = 2; i < 32; i += 2) begin |
| bitcnt_partial[i] = bitcnt_partial[i-1] + {5'h0, bitcnt_bits[i]}; |
| end |
| end |
| |
| /////////////// |
| // Min / Max // |
| /////////////// |
| |
| // Operand A is returned when the comparison holds, operand B otherwise. |
| assign minmax_result = cmp_result ? operand_a_i : operand_b_i; |
| |
| ////////// |
| // Pack // |
| ////////// |
| |
| logic packu, packh; |
| assign packu = (operator_i == ALU_PACKU); |
| assign packh = (operator_i == ALU_PACKH); |
| |
| // packu joins the upper halves, packh the low bytes (zero-extended), and every other |
| // operator selects the join of the lower halves. Operand B always supplies the upper part. |
| always_comb begin |
|   unique case (1'b1) |
|     packu:   pack_result = {operand_b_i[31:16], operand_a_i[31:16]}; |
|     packh:   pack_result = {16'h0, operand_b_i[7:0], operand_a_i[7:0]}; |
|     default: pack_result = {operand_b_i[15:0], operand_a_i[15:0]}; |
|   endcase |
| end |
| |
| ////////// |
| // Sext // |
| ////////// |
| |
| // Sign-extend the low byte for ALU_SEXTB, the low half word otherwise. |
| assign sext_result = (operator_i == ALU_SEXTB) ? {{24{operand_a_i[7]}}, operand_a_i[7:0]} |
|                                                : {{16{operand_a_i[15]}}, operand_a_i[15:0]}; |
| |
| ///////////////////////////// |
| // Single-bit Instructions // |
| ///////////////////////////// |
| |
| // For bset / bclr / binv, shift_result is combined with operand A. |
| always_comb begin |
|   // ALU_BEXT (and any other operator): return bit [0] of the shifter result. |
|   singlebit_result = {31'h0, shift_result[0]}; |
| |
|   unique case (operator_i) |
|     ALU_BSET: singlebit_result = shift_result | operand_a_i; |
|     ALU_BCLR: singlebit_result = ~shift_result & operand_a_i; |
|     ALU_BINV: singlebit_result = shift_result ^ operand_a_i; |
|     default: ; |
|   endcase |
| end |
| |
| //////////////////////////////////// |
| // General Reverse and Or-combine // |
| //////////////////////////////////// |
| |
| // Only a subset of the general reverse and or-combine instructions are implemented in the |
| // balanced version of the B extension. Currently rev8 (shift_amt = 5'b11000) and orc.b |
| // (shift_amt = 5'b00111) are supported in the base extension. |
| |
| logic [4:0] zbp_shift_amt; |
| logic gorc_op; |
| |
| assign gorc_op = (operator_i == ALU_GORC); |
| |
| // OTEarlGrey / Full use the full shift amount. Otherwise bits [2:0] all follow shift_amt[0] |
| // and bits [4:3] both follow shift_amt[3]. |
| assign zbp_shift_amt = (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) ? |
|     shift_amt[4:0] : {{2{shift_amt[3]}}, {3{shift_amt[0]}}}; |
| |
| // General reverse / or-combine network: five conditional swap stages (1, 2, 4, 8 and 16 bits). |
| // For or-combine the unswapped value is OR-ed into each enabled stage; in the 8 and 16 bit |
| // stages this is only done in the OTEarlGrey / Full configurations. |
| always_comb begin |
|   rev_result = operand_a_i; |
| |
|   // Swap adjacent bits. |
|   if (zbp_shift_amt[0]) begin |
|     rev_result = ((rev_result << 1) & 32'haaaa_aaaa) | |
|                  ((rev_result >> 1) & 32'h5555_5555) | |
|                  (gorc_op ? rev_result : 32'h0); |
|   end |
| |
|   // Swap adjacent bit pairs. |
|   if (zbp_shift_amt[1]) begin |
|     rev_result = ((rev_result << 2) & 32'hcccc_cccc) | |
|                  ((rev_result >> 2) & 32'h3333_3333) | |
|                  (gorc_op ? rev_result : 32'h0); |
|   end |
| |
|   // Swap adjacent nibbles. |
|   if (zbp_shift_amt[2]) begin |
|     rev_result = ((rev_result << 4) & 32'hf0f0_f0f0) | |
|                  ((rev_result >> 4) & 32'h0f0f_0f0f) | |
|                  (gorc_op ? rev_result : 32'h0); |
|   end |
| |
|   // Swap adjacent bytes. |
|   if (zbp_shift_amt[3]) begin |
|     rev_result = ((rev_result << 8) & 32'hff00_ff00) | |
|                  ((rev_result >> 8) & 32'h00ff_00ff) | |
|                  (((RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) && gorc_op) ? |
|                       rev_result : 32'h0); |
|   end |
| |
|   // Swap the two half words. |
|   if (zbp_shift_amt[4]) begin |
|     rev_result = ((rev_result << 16) & 32'hffff_0000) | |
|                  ((rev_result >> 16) & 32'h0000_ffff) | |
|                  (((RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) && gorc_op) ? |
|                       rev_result : 32'h0); |
|   end |
| end |
| |
| logic crc_hmode; |
| logic crc_bmode; |
| logic [31:0] clmul_result_rev; |
| |
| if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) begin : gen_alu_rvb_otearlgrey_full |
| |
| ///////////////////////// |
| // Shuffle / Unshuffle // |
| ///////////////////////// |
| |
| localparam logic [31:0] SHUFFLE_MASK_L [4] = |
| '{32'h00ff_0000, 32'h0f00_0f00, 32'h3030_3030, 32'h4444_4444}; |
| localparam logic [31:0] SHUFFLE_MASK_R [4] = |
| '{32'h0000_ff00, 32'h00f0_00f0, 32'h0c0c_0c0c, 32'h2222_2222}; |
| |
| localparam logic [31:0] FLIP_MASK_L [4] = |
| '{32'h2200_1100, 32'h0044_0000, 32'h4411_0000, 32'h1100_0000}; |
| localparam logic [31:0] FLIP_MASK_R [4] = |
| '{32'h0088_0044, 32'h0000_2200, 32'h0000_8822, 32'h0000_0088}; |
| |
| // Per stage: the bits that belong to neither the left nor the right shuffle mask. |
| logic [31:0] SHUFFLE_MASK_NOT [4]; |
| for (genvar stage = 0; stage < 4; stage++) begin : gen_shuffle_mask_not |
|   assign SHUFFLE_MASK_NOT[stage] = ~SHUFFLE_MASK_L[stage] & ~SHUFFLE_MASK_R[stage]; |
| end |
| |
| logic shuffle_flip; |
| logic [3:0] shuffle_mode; |
| |
| assign shuffle_flip = (operator_i == ALU_UNSHFL); |
| |
| // Shuffle / unshuffle network. For unshfl the stage-enable bits are applied in reversed |
| // order and the four shuffle stages are wrapped in a flip permutation before and after. |
| always_comb begin |
|   shuffle_result = operand_a_i; |
| |
|   if (shuffle_flip) begin |
|     shuffle_mode = {shift_amt[0], shift_amt[1], shift_amt[2], shift_amt[3]}; |
|   end else begin |
|     shuffle_mode = shift_amt[3:0]; |
|   end |
| |
|   // Flip before the shuffle stages (unshfl only). |
|   if (shuffle_flip) begin |
|     shuffle_result = ((shuffle_result << 6)  & FLIP_MASK_L[0]) | |
|                      ((shuffle_result << 9)  & FLIP_MASK_L[1]) | |
|                      ((shuffle_result << 15) & FLIP_MASK_L[2]) | |
|                      ((shuffle_result << 21) & FLIP_MASK_L[3]) | |
|                      ((shuffle_result >> 6)  & FLIP_MASK_R[0]) | |
|                      ((shuffle_result >> 9)  & FLIP_MASK_R[1]) | |
|                      ((shuffle_result >> 15) & FLIP_MASK_R[2]) | |
|                      ((shuffle_result >> 21) & FLIP_MASK_R[3]) | |
|                      (shuffle_result & 32'h8822_4411); |
|   end |
| |
|   // Stage s is enabled by shuffle_mode[3-s] and moves bits by 8, 4, 2 and 1 positions. |
|   for (int unsigned s = 0; s < 4; s++) begin |
|     if (shuffle_mode[3-s]) begin |
|       shuffle_result = ((shuffle_result << (8 >> s)) & SHUFFLE_MASK_L[s]) | |
|                        ((shuffle_result >> (8 >> s)) & SHUFFLE_MASK_R[s]) | |
|                        (shuffle_result & SHUFFLE_MASK_NOT[s]); |
|     end |
|   end |
| |
|   // Flip after the shuffle stages (unshfl only). |
|   if (shuffle_flip) begin |
|     shuffle_result = ((shuffle_result << 6)  & FLIP_MASK_L[0]) | |
|                      ((shuffle_result << 9)  & FLIP_MASK_L[1]) | |
|                      ((shuffle_result << 15) & FLIP_MASK_L[2]) | |
|                      ((shuffle_result << 21) & FLIP_MASK_L[3]) | |
|                      ((shuffle_result >> 6)  & FLIP_MASK_R[0]) | |
|                      ((shuffle_result >> 9)  & FLIP_MASK_R[1]) | |
|                      ((shuffle_result >> 15) & FLIP_MASK_R[2]) | |
|                      ((shuffle_result >> 21) & FLIP_MASK_R[3]) | |
|                      (shuffle_result & 32'h8822_4411); |
|   end |
| end |
| |
| ////////////// |
| // Crossbar // |
| ////////////// |
| // The crossbar permutation instructions xperm.[nbh] (Zbp) can be implemented using 8 |
| // parallel 4-bit-wide, 8-input crossbars. Basically, we permute the 8 nibbles of operand_a_i |
| // based on operand_b_i. |
| |
| // Generate selector indices and valid signals. |
| // - sel_n[x] indicates which nibble of operand_a_i is selected for output nibble x. |
| // - vld_n[x] indicates if the selection is valid. |
| logic [7:0][2:0] sel_n; // nibbles |
| logic [7:0]      vld_n; // nibbles |
| logic [3:0][1:0] sel_b; // bytes |
| logic [3:0]      vld_b; // bytes |
| logic [1:0][0:0] sel_h; // half words |
| logic [1:0]      vld_h; // half words |
| |
| // Per nibble, 3 bits are needed for the selection. Other bits must be zero. |
| // sel_n bit mask: 32'b0111_0111_0111_0111_0111_0111_0111_0111 |
| // vld_n bit mask: 32'b1000_1000_1000_1000_1000_1000_1000_1000 |
| for (genvar n = 0; n < 8; n++) begin : gen_sel_vld_n |
|   assign sel_n[n] = operand_b_i[n*4 +: 3]; |
|   assign vld_n[n] = ~operand_b_i[n*4 + 3]; |
| end |
| |
| // Per byte, 2 bits are needed for the selection. Other bits must be zero. |
| // sel_b bit mask: 32'b0000_0011_0000_0011_0000_0011_0000_0011 |
| // vld_b bit mask: 32'b1111_1100_1111_1100_1111_1100_1111_1100 |
| for (genvar b = 0; b < 4; b++) begin : gen_sel_vld_b |
|   assign sel_b[b] = operand_b_i[b*8 +: 2]; |
|   assign vld_b[b] = ~(|operand_b_i[b*8 + 2 +: 6]); |
| end |
| |
| // Per half word, 1 bit is needed for the selection only. All other bits must be zero. |
| // sel_h bit mask: 32'b0000_0000_0000_0001_0000_0000_0000_0001 |
| // vld_h bit mask: 32'b1111_1111_1111_1110_1111_1111_1111_1110 |
| for (genvar h = 0; h < 2; h++) begin : gen_sel_vld_h |
|   assign sel_h[h] = operand_b_i[h*16 +: 1]; |
|   assign vld_h[h] = ~(|operand_b_i[h*16 + 1 +: 15]); |
| end |
| |
| // Convert selector indices and valid signals to control the nibble-based |
| // crossbar logic. |
| logic [7:0][2:0] sel; |
| logic [7:0]      vld; |
| always_comb begin |
|   // Default: tie valid to zero to disable the crossbar unless we need it. |
|   sel = sel_n; |
|   vld = '0; |
| |
|   unique case (operator_i) |
|     // Nibble mode: the selectors need no conversion, only the valids are enabled. |
|     ALU_XPERM_N: vld = vld_n; |
| |
|     // Convert byte to nibble indices: each byte covers two adjacent nibbles. |
|     ALU_XPERM_B: begin |
|       for (int b = 0; b < 4; b++) begin |
|         sel[b*2 + 0]  = {sel_b[b], 1'b0}; |
|         sel[b*2 + 1]  = {sel_b[b], 1'b1}; |
|         vld[b*2 +: 2] = {2{vld_b[b]}}; |
|       end |
|     end |
| |
|     // Convert half-word to nibble indices: each half word covers four adjacent nibbles. |
|     ALU_XPERM_H: begin |
|       for (int h = 0; h < 2; h++) begin |
|         sel[h*4 + 0]  = {sel_h[h], 2'b00}; |
|         sel[h*4 + 1]  = {sel_h[h], 2'b01}; |
|         sel[h*4 + 2]  = {sel_h[h], 2'b10}; |
|         sel[h*4 + 3]  = {sel_h[h], 2'b11}; |
|         vld[h*4 +: 4] = {4{vld_h[h]}}; |
|       end |
|     end |
| |
|     default: ; |
|   endcase |
| end |
| |
| // The actual nibble-based crossbar: output nibble n takes the operand A nibble selected by |
| // sel[n], or zero when the selection is not valid. |
| logic [7:0][3:0] val_n, xperm_n; |
| assign val_n = operand_a_i; |
| for (genvar n = 0; n < 8; n++) begin : gen_xperm_n |
|   assign xperm_n[n] = vld[n] ? val_n[sel[n]] : 4'h0; |
| end |
| assign xperm_result = xperm_n; |
| |
| /////////////////////////////////////////////////// |
| // Carry-less Multiply + Cyclic Redundancy Check // |
| /////////////////////////////////////////////////// |
| |
| // Carry-less multiplication can be understood as multiplication based on |
| // the addition interpreted as the bit-wise xor operation. |
| // |
| // Example: 1101 X 1011 = 1111111: |
| // |
| // 1011 X 1101 |
| // ----------- |
| // 1101 |
| // xor 1101 |
| // --------- |
| // 10111 |
| // xor 0000 |
| // ---------- |
| // 010111 |
| // xor 1101 |
| // ----------- |
| // 1111111 |
| // |
| // Architectural details: |
| // A 32 x 32-bit array |
| // [ operand_b[i] ? (operand_a << i) : '0 for i in 0 ... 31 ] |
| // is generated. The entries of the array are pairwise 'xor-ed' |
| // together in a 5-stage binary tree. |
| // |
| // |
| // Cyclic Redundancy Check: |
| // |
| // CRC-32 (CRC-32/ISO-HDLC) and CRC-32C (CRC-32/ISCSI) are directly implemented. For |
| // documentation of the crc configuration (crc-polynomials, initialization, reflection, etc.) |
| // see http://reveng.sourceforge.net/crc-catalogue/all.htm |
| // A useful guide to crc arithmetic and algorithms is given here: |
| // http://www.piclist.com/techref/method/math/crcguide.html. |
| // |
| // The CRC operation solves the following equation using binary polynomial arithmetic: |
| // |
| // rev(rd)(x) = rev(rs1)(x) * x**n mod {1, P}(x) |
| // |
| // where P denotes lower 32 bits of the corresponding CRC polynomial, rev(a) the bit reversal |
| // of a, n = 8,16, or 32 for .b, .h, .w -variants. {a, b} denotes bit concatenation. |
| // |
| // Using Barrett reduction, one can show that |
| // |
| // M(x) mod P(x) = R(x) = |
| // (M(x) * x**n) & {deg(P(x)'{1'b1}}) ^ (M(x) x**-(deg(P(x) - n)) cx mu(x) cx P(x), |
| // |
| // Where mu(x) = polydiv(x**64, {1,P}) & 0xffffffff. Here, 'cx' refers to carry-less |
| // multiplication. Substituting rev(rd)(x) for R(x) and rev(rs1)(x) for M(x) and solving for |
| // rd(x) with P(x) a crc32 polynomial (deg(P(x)) = 32), we get |
| // |
| // rd = rev( (rev(rs1) << n) ^ ((rev(rs1) >> (32-n)) cx mu cx P) |
| // = (rs1 >> n) ^ rev(rev( (rs1 << (32-n)) cx rev(mu)) cx P) |
| // ^-- cycle 0--------------------^ |
| // ^- cycle 1 -------------------------------------------^ |
| // |
| // In the last step we used the fact that carry-less multiplication is bit-order agnostic: |
| // rev(a cx b) = rev(a) cx rev(b). |
| |
| logic clmul_rmode; |
| logic clmul_hmode; |
| logic [31:0] clmul_op_a; |
| logic [31:0] clmul_op_b; |
| logic [31:0] operand_b_rev; |
| logic [31:0] clmul_and_stage[32]; |
| logic [31:0] clmul_xor_stage1[16]; |
| logic [31:0] clmul_xor_stage2[8]; |
| logic [31:0] clmul_xor_stage3[4]; |
| logic [31:0] clmul_xor_stage4[2]; |
| |
| logic [31:0] clmul_result_raw; |
| |
| // Bit-reversed copy of operand_b_i. |
| for (genvar bit_idx = 0; bit_idx < 32; bit_idx++) begin : gen_rev_operand_b |
|   assign operand_b_rev[31-bit_idx] = operand_b_i[bit_idx]; |
| end |
| |
| // Carry-less multiply variants that operate on bit-reversed operands. |
| assign clmul_rmode = (operator_i == ALU_CLMULR); |
| assign clmul_hmode = (operator_i == ALU_CLMULH); |
| |
| // CRC |
| // Lower 32 bits of the two supported CRC polynomials and the matching bit-reversed mu |
| // constants used by the two carry-less multiplication steps. |
| localparam logic [31:0] CRC32_POLYNOMIAL = 32'h04c1_1db7; |
| localparam logic [31:0] CRC32C_POLYNOMIAL = 32'h1edc_6f41; |
| localparam logic [31:0] CRC32_MU_REV = 32'hf701_1641; |
| localparam logic [31:0] CRC32C_MU_REV = 32'hdea7_13f1; |
| |
| logic crc_op, crc_cpoly; |
| logic [31:0] crc_operand, crc_poly, crc_mu_rev; |
| |
| // Set for the three CRC32C operators (.b, .h, .w). |
| assign crc_cpoly = (operator_i == ALU_CRC32C_B) | |
|                    (operator_i == ALU_CRC32C_H) | |
|                    (operator_i == ALU_CRC32C_W); |
| |
| // Set for any of the six CRC operators. |
| assign crc_op = crc_cpoly | |
|                 (operator_i == ALU_CRC32_B) | |
|                 (operator_i == ALU_CRC32_H) | |
|                 (operator_i == ALU_CRC32_W); |
| |
| // Operand width decode: byte and halfword variants (word is the remaining case). |
| assign crc_bmode = (operator_i == ALU_CRC32_B) | (operator_i == ALU_CRC32C_B); |
| assign crc_hmode = (operator_i == ALU_CRC32_H) | (operator_i == ALU_CRC32C_H); |
| |
| // Constants matching the selected polynomial. |
| assign crc_mu_rev = crc_cpoly ? CRC32C_MU_REV : CRC32_MU_REV; |
| assign crc_poly = crc_cpoly ? CRC32C_POLYNOMIAL : CRC32_POLYNOMIAL; |
| |
| // Left-align the low byte / halfword of operand_a_i for the .b / .h variants; the word |
| // variant passes operand_a_i through unchanged. |
| always_comb begin |
|   unique if (crc_bmode) begin |
|     crc_operand = operand_a_i << 24; |
|   end else if (crc_hmode) begin |
|     crc_operand = operand_a_i << 16; |
|   end else begin |
|     crc_operand = operand_a_i; |
|   end |
| end |
| |
| // Select clmul input |
| // Default: plain carry-less multiply of the register operands, bit-reversed for the |
| // CLMULR / CLMULH variants. CRC operations override this: the first cycle multiplies the |
| // aligned operand with rev(mu), the following cycle multiplies the stored intermediate |
| // value with the polynomial. |
| always_comb begin |
|   clmul_op_a = (clmul_rmode | clmul_hmode) ? operand_a_rev : operand_a_i; |
|   clmul_op_b = (clmul_rmode | clmul_hmode) ? operand_b_rev : operand_b_i; |
| |
|   if (crc_op) begin |
|     clmul_op_a = instr_first_cycle_i ? crc_operand : imd_val_q_i[0]; |
|     clmul_op_b = instr_first_cycle_i ? crc_mu_rev : crc_poly; |
|   end |
| end |
| |
| // Partial products: clmul_op_a shifted left by n wherever bit n of clmul_op_b is set. |
| for (genvar pp = 0; pp < 32; pp++) begin : gen_clmul_and_op |
|   assign clmul_and_stage[pp] = clmul_op_b[pp] ? (clmul_op_a << pp) : 32'h0; |
| end |
| |
| // XOR reduction tree: 32 -> 16 -> 8 -> 4 -> 2 -> 1 (lower 32 bits of the product only). |
| for (genvar pair = 0; pair < 16; pair++) begin : gen_clmul_xor_op_l1 |
|   assign clmul_xor_stage1[pair] = clmul_and_stage[2*pair+1] ^ clmul_and_stage[2*pair]; |
| end |
| |
| for (genvar pair = 0; pair < 8; pair++) begin : gen_clmul_xor_op_l2 |
|   assign clmul_xor_stage2[pair] = clmul_xor_stage1[2*pair+1] ^ clmul_xor_stage1[2*pair]; |
| end |
| |
| for (genvar pair = 0; pair < 4; pair++) begin : gen_clmul_xor_op_l3 |
|   assign clmul_xor_stage3[pair] = clmul_xor_stage2[2*pair+1] ^ clmul_xor_stage2[2*pair]; |
| end |
| |
| for (genvar pair = 0; pair < 2; pair++) begin : gen_clmul_xor_op_l4 |
|   assign clmul_xor_stage4[pair] = clmul_xor_stage3[2*pair+1] ^ clmul_xor_stage3[2*pair]; |
| end |
| |
| assign clmul_result_raw = clmul_xor_stage4[1] ^ clmul_xor_stage4[0]; |
| |
| // Bit-reversed view of the raw product. |
| for (genvar src = 0; src < 32; src++) begin : gen_rev_clmul_result |
|   assign clmul_result_rev[31 - src] = clmul_result_raw[src]; |
| end |
| |
| // Final carry-less multiply result: |
| //   clmul  : raw product |
| //   clmulr : rev(clmul(rev(a), rev(b))) |
| //   clmulh : clmulr result shifted right by one |
| always_comb begin |
|   unique if (clmul_rmode) begin |
|     clmul_result = clmul_result_rev; |
|   end else if (clmul_hmode) begin |
|     clmul_result = {1'b0, clmul_result_rev[31:1]}; |
|   end else begin |
|     clmul_result = clmul_result_raw; |
|   end |
| end |
| end else begin : gen_alu_rvb_not_otearlgrey_full |
| // Configuration without shuffle / xperm / clmul / crc hardware: tie every signal that the |
| // other generate branch would drive to zero. |
| // support signals |
| assign crc_hmode = 1'b0; |
| assign crc_bmode = 1'b0; |
| assign clmul_result_rev = '0; |
| // result signals |
| assign clmul_result = '0; |
| assign xperm_result = '0; |
| assign shuffle_result = '0; |
| end |
| |
| if (RV32B == RV32BFull) begin : gen_alu_rvb_full |
| |
| /////////////// |
| // Butterfly // |
| /////////////// |
| |
| // The butterfly / inverse butterfly network executing bcompress/bdecompress (zbe) |
| // instructions. For bdecompress, the control bits mask of a local left region is generated |
| // by the inverse of a n-bit left rotate and complement upon wrap (LROTC) operation by the |
| // number of ones in the deposit bitmask to the right of the segment. n hereby denotes the |
| // width of the according segment. The bitmask for a pertaining local right region is equal |
| // to the corresponding local left region. Bcompress uses an analogous inverse process. |
| // Consider the following 8-bit example. For details, see Hilewitz et al. "Fast Bit Gather, |
| // Bit Scatter and Bit Permutation Instructions for Commodity Microprocessors", (2008). |
| // |
| // The bcompress/bdecompress instructions are completed in 2 cycles. In the first cycle, the |
| // control bitmask is prepared by executing the parallel prefix bit count. In the second |
| // cycle, the bit swapping is executed according to the control masks. |
| |
| // 8-bit example: (Hilewitz et al.) |
| // Consider the instruction bdecompress operand_a_i deposit_mask |
| // Let operand_a_i = 8'babcd_efgh |
| // deposit_mask = 8'b1010_1101 |
| // |
| // control bitmask for stage 1: |
| // - number of ones in the right half of the deposit bitmask: 3 |
| // - width of the segment: 4 |
| // - control bitmask = ~LROTC(4'b0, 3)[3:0] = 4'b1000 |
| // |
| // control bitmask: c3 c2 c1 c0 c3 c2 c1 c0 |
| // 1 0 0 0 1 0 0 0 |
| // <- L -----> <- R -----> |
| // operand_a_i a b c d e f g h |
| // :\ | | | /: | | | |
| // : +|---|--|-+ : | | | |
| // :/ | | | \: | | | |
| // stage 1 e b c d a f g h |
| // <L-> <R-> <L-> <R-> |
| // control bitmask: c3 c2 c3 c2 c1 c0 c1 c0 |
| // 1 1 1 1 1 0 1 0 |
| // :\ :\ /: /: :\ | /: | |
| // : +:-+-:+ : : +|-+ : | |
| // :/ :/ \: \: :/ | \: | |
| // stage 2 c d e b g f a h |
| // L R L R L R L R |
| // control bitmask: c3 c3 c2 c2 c1 c1 c0 c0 |
| // 1 1 0 0 1 1 0 0 |
| // :\/: | | :\/: | | |
| // : : | | : : | | |
| // :/\: | | :/\: | | |
| // stage 3 d c e b f g a h |
| // & deposit bitmask: 1 0 1 0 1 1 0 1 |
| // result: d 0 e 0 f g 0 h |
| |
| // Partial bit counts as reloaded from the intermediate value inputs in the second cycle. |
| logic [ 5:0] bitcnt_partial_q [32]; |
| |
| // first cycle |
| // Store partial bitcnts. Only the count bits that are read back in the second cycle are |
| // kept, packed into two 32-bit words: |
| //   bitcnt_partial_lsb_d[n]     : bit 0 of every count |
| //   bitcnt_partial_msb_d[15:0]  : bit 1 of counts 1, 3, 5, ... |
| //   bitcnt_partial_msb_d[23:16] : bit 2 of counts 3, 7, 11, ... |
| //   bitcnt_partial_msb_d[27:24] : bit 3 of counts 7, 15, 23, 31 |
| //   bitcnt_partial_msb_d[29:28] : bit 4 of counts 15, 31 |
| //   bitcnt_partial_msb_d[30]    : bit 5 of count 31 |
| for (genvar pos = 0; pos < 32; pos++) begin : gen_bitcnt_reg_in_lsb |
|   assign bitcnt_partial_lsb_d[pos] = bitcnt_partial[pos][0]; |
| end |
| |
| for (genvar grp = 0; grp < 16; grp++) begin : gen_bitcnt_reg_in_b1 |
|   assign bitcnt_partial_msb_d[grp] = bitcnt_partial[2*grp + 1][1]; |
| end |
| |
| for (genvar grp = 0; grp < 8; grp++) begin : gen_bitcnt_reg_in_b2 |
|   assign bitcnt_partial_msb_d[16 + grp] = bitcnt_partial[4*grp + 3][2]; |
| end |
| |
| for (genvar grp = 0; grp < 4; grp++) begin : gen_bitcnt_reg_in_b3 |
|   assign bitcnt_partial_msb_d[24 + grp] = bitcnt_partial[8*grp + 7][3]; |
| end |
| |
| for (genvar grp = 0; grp < 2; grp++) begin : gen_bitcnt_reg_in_b4 |
|   assign bitcnt_partial_msb_d[28 + grp] = bitcnt_partial[16*grp + 15][4]; |
| end |
| |
| assign bitcnt_partial_msb_d[30] = bitcnt_partial[31][5]; |
| assign bitcnt_partial_msb_d[31] = 1'b0; // unused |
| |
| // Second cycle |
| // Load partial bitcnts: unpack the two intermediate words back into the per-position |
| // counts. Count bits that were not stored stay at zero. |
| always_comb begin |
|   bitcnt_partial_q = '{default: '0}; |
| |
|   // bit 0 of every count comes from imd_val_q_i[0] |
|   for (int unsigned pos = 0; pos < 32; pos++) begin |
|     bitcnt_partial_q[pos][0] = imd_val_q_i[0][pos]; |
|   end |
| |
|   // higher count bits come from imd_val_q_i[1] |
|   for (int unsigned grp = 0; grp < 16; grp++) begin |
|     bitcnt_partial_q[2*grp + 1][1] = imd_val_q_i[1][grp]; |
|   end |
| |
|   for (int unsigned grp = 0; grp < 8; grp++) begin |
|     bitcnt_partial_q[4*grp + 3][2] = imd_val_q_i[1][16 + grp]; |
|   end |
| |
|   for (int unsigned grp = 0; grp < 4; grp++) begin |
|     bitcnt_partial_q[8*grp + 7][3] = imd_val_q_i[1][24 + grp]; |
|   end |
| |
|   for (int unsigned grp = 0; grp < 2; grp++) begin |
|     bitcnt_partial_q[16*grp + 15][4] = imd_val_q_i[1][28 + grp]; |
|   end |
| |
|   bitcnt_partial_q[31][5] = imd_val_q_i[1][30]; |
| end |
| |
| logic [31:0] butterfly_mask_l[5]; |
| logic [31:0] butterfly_mask_r[5]; |
| logic [31:0] butterfly_mask_not[5]; |
| logic [31:0] lrotc_stage [5]; // left rotate and complement upon wrap |
| |
| // bcompress / bdecompress control bit generation |
| for (genvar stg = 0; stg < 5; stg++) begin : gen_butterfly_ctrl_stage |
|   // number of bits in a local left / right region = 32 / 2**(stg + 1) = 16 / 2**stg |
|   localparam int RegionW = 16 >> stg; |
| |
|   // number of segs: 2**stg, each made of a right region (low bits) and a left region |
|   for (genvar seg = 0; seg < 2**stg; seg++) begin : gen_butterfly_ctrl |
|     localparam int SegLo  = 2 * RegionW * seg;  // first bit of the right region |
|     localparam int SegMid = SegLo + RegionW;    // first bit of the left region |
|     localparam int SegHi  = SegMid + RegionW;   // one past the last bit of the segment |
| |
|     // RegionW ones shifted left by the partial bit count of the right region |
|     assign lrotc_stage[stg][SegHi-1 : SegLo] = |
|         {{RegionW{1'b0}}, {RegionW{1'b1}}} << |
|             bitcnt_partial_q[SegMid-1][$clog2(RegionW):0]; |
| |
|     // both masks carry the inverted upper half of the shifted pattern, each in its own region |
|     assign butterfly_mask_l[stg][SegHi-1 : SegMid] = ~lrotc_stage[stg][SegHi-1 : SegMid]; |
|     assign butterfly_mask_r[stg][SegMid-1 : SegLo] = ~lrotc_stage[stg][SegHi-1 : SegMid]; |
| |
|     // the opposite region of each mask is always zero |
|     assign butterfly_mask_l[stg][SegMid-1 : SegLo] = '0; |
|     assign butterfly_mask_r[stg][SegHi-1 : SegMid] = '0; |
|   end |
| end |
| |
| // bits that are selected by neither the left nor the right mask stay in place |
| for (genvar stg = 0; stg < 5; stg++) begin : gen_butterfly_not |
|   assign butterfly_mask_not[stg] = ~butterfly_mask_l[stg] & ~butterfly_mask_r[stg]; |
| end |
| |
| // Butterfly network: stages 0..4 swap bits at distances 16, 8, 4, 2, 1 under control of |
| // the per-stage masks; the output is finally masked with operand_b_i. |
| always_comb begin |
|   butterfly_result = operand_a_i; |
| |
|   for (int unsigned stg = 0; stg < 5; stg++) begin |
|     // bits under mask_l move down, bits under mask_r move up, all others stay in place |
|     butterfly_result = (butterfly_result & butterfly_mask_not[stg]) | |
|                        ((butterfly_result & butterfly_mask_l[stg]) >> (16 >> stg)) | |
|                        ((butterfly_result & butterfly_mask_r[stg]) << (16 >> stg)); |
|   end |
| |
|   butterfly_result = butterfly_result & operand_b_i; |
| end |
| |
| // Inverse butterfly network: the input is masked with operand_b_i first, then stages |
| // 4..0 swap bits at distances 1, 2, 4, 8, 16 under control of the per-stage masks. |
| always_comb begin |
|   invbutterfly_result = operand_a_i & operand_b_i; |
| |
|   for (int stg = 4; stg >= 0; stg--) begin |
|     // bits under mask_l move down, bits under mask_r move up, all others stay in place |
|     invbutterfly_result = (invbutterfly_result & butterfly_mask_not[stg]) | |
|                           ((invbutterfly_result & butterfly_mask_l[stg]) >> (16 >> stg)) | |
|                           ((invbutterfly_result & butterfly_mask_r[stg]) << (16 >> stg)); |
|   end |
| end |
| end else begin : gen_alu_rvb_not_full |
| // No butterfly network in this configuration: tie off its outputs and the partial bit |
| // count storage signals. |
| // support signals |
| assign bitcnt_partial_msb_d = '0; |
| assign bitcnt_partial_lsb_d = '0; |
| // result signals |
| assign invbutterfly_result = '0; |
| assign butterfly_result = '0; |
| // imd_val_q_i[1] is only read by the butterfly network; consume it here. |
| logic [31:0] unused_imd_val_q_1; |
| assign unused_imd_val_q_1 = imd_val_q_i[1]; |
| end |
| |
| ////////////////////////////////////// |
| // Multicycle Bitmanip Instructions // |
| ////////////////////////////////////// |
| // Ternary instructions + Shift Rotations + Bit Compress/Decompress + CRC |
| // For ternary instructions (zbt), operand_a_i is tied to rs1 in the first cycle and rs3 in the |
| // second cycle. operand_b_i is always tied to rs2. |
| |
| // Result, intermediate value and intermediate write-enable generation for the multicycle |
| // operators. Every supported operator raises its write-enable bits only while |
| // instr_first_cycle_i is set; the result is formed from imd_val_q_i together with the |
| // current functional-unit outputs. |
| always_comb begin |
|   // Defaults, used by all other operators and by operators that the RV32B configuration |
|   // does not implement: zero result, no intermediate write. |
|   multicycle_result = '0; |
|   imd_val_d_o = '{operand_a_i, 32'h0}; |
|   imd_val_we_o = 2'b00; |
| |
|   unique case (operator_i) |
|     ALU_CMOV: begin |
|       // imd_val_d_o keeps its default: operand_a_i is stored in the first cycle |
|       multicycle_result = (operand_b_i == 32'h0) ? operand_a_i : imd_val_q_i[0]; |
|       if (instr_first_cycle_i) begin |
|         imd_val_we_o = 2'b01; |
|       end |
|     end |
| |
|     ALU_CMIX: begin |
|       multicycle_result = imd_val_q_i[0] | bwlogic_and_result; |
|       imd_val_d_o = '{bwlogic_and_result, 32'h0}; |
|       if (instr_first_cycle_i) begin |
|         imd_val_we_o = 2'b01; |
|       end |
|     end |
| |
|     ALU_FSR, ALU_FSL, |
|     ALU_ROL, ALU_ROR: begin |
|       if (shift_amt[4:0] == 5'h0) begin |
|         // shift by a multiple of 32: pass one of the two words through unmodified |
|         multicycle_result = shift_amt[5] ? operand_a_i : imd_val_q_i[0]; |
|       end else begin |
|         multicycle_result = imd_val_q_i[0] | shift_result; |
|       end |
|       imd_val_d_o = '{shift_result, 32'h0}; |
|       if (instr_first_cycle_i) begin |
|         imd_val_we_o = 2'b01; |
|       end |
|     end |
| |
|     ALU_CRC32_W, ALU_CRC32C_W, |
|     ALU_CRC32_H, ALU_CRC32C_H, |
|     ALU_CRC32_B, ALU_CRC32C_B: begin |
|       if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) begin |
|         // word variant: reversed clmul output; .b / .h additionally XOR in operand_a_i >> n |
|         multicycle_result = clmul_result_rev; |
|         unique case (1'b1) |
|           crc_bmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 8); |
|           crc_hmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 16); |
|           default: ; |
|         endcase |
|         imd_val_d_o = '{clmul_result_rev, 32'h0}; |
|         if (instr_first_cycle_i) begin |
|           imd_val_we_o = 2'b01; |
|         end |
|       end |
|     end |
| |
|     ALU_BCOMPRESS, ALU_BDECOMPRESS: begin |
|       if (RV32B == RV32BFull) begin |
|         multicycle_result = (operator_i == ALU_BDECOMPRESS) ? butterfly_result : |
|                                                               invbutterfly_result; |
|         // both intermediate words are written: they hold the packed partial bit counts |
|         imd_val_d_o = '{bitcnt_partial_lsb_d, bitcnt_partial_msb_d}; |
|         if (instr_first_cycle_i) begin |
|           imd_val_we_o = 2'b11; |
|         end |
|       end |
|     end |
| |
|     default: ; |
|   endcase |
| end |
| |
| |
| end else begin : g_no_alu_rvb |
| // Without RV32B none of the bit-manipulation units exist. Consume the inputs and signals |
| // that would otherwise be unread, and tie every RV32B result to zero. |
| logic [31:0] unused_imd_val_q[2]; |
| logic [31:0] unused_butterfly_result; |
| logic [31:0] unused_invbutterfly_result; |
| assign unused_imd_val_q = imd_val_q_i; |
| assign unused_butterfly_result = butterfly_result; |
| assign unused_invbutterfly_result = invbutterfly_result; |
| |
| // RV32B support signals: the intermediate value registers are never written |
| assign imd_val_we_o = 2'b00; |
| assign imd_val_d_o = '{default: '0}; |
| |
| // RV32B result signals |
| assign multicycle_result = '0; |
| assign clmul_result = '0; |
| assign invbutterfly_result = '0; |
| assign butterfly_result = '0; |
| assign xperm_result = '0; |
| assign shuffle_result = '0; |
| assign rev_result = '0; |
| assign singlebit_result = '0; |
| assign sext_result = '0; |
| assign pack_result = '0; |
| assign minmax_result = '0; |
| assign bitcnt_result = '0; |
| end |
| |
| //////////////// |
| // Result mux // |
| //////////////// |
| |
| // Select the ALU result from the functional-unit outputs according to operator_i. |
| // Operators without an entry below produce zero. |
| always_comb begin |
|   unique case (operator_i) |
|     // Adder Operations (SHxADD: RV32B) |
|     ALU_ADD, |
|     ALU_SUB, |
|     ALU_SH1ADD, |
|     ALU_SH2ADD, |
|     ALU_SH3ADD: result_o = adder_result; |
| |
|     // Bitwise Logic Operations (negate: RV32B) |
|     ALU_AND, ALU_ANDN, |
|     ALU_OR, ALU_ORN, |
|     ALU_XOR, ALU_XNOR: result_o = bwlogic_result; |
| |
|     // Shift Operations (SLO / SRO: RV32B) |
|     ALU_SLL, |
|     ALU_SRL, |
|     ALU_SRA, |
|     ALU_SLO, |
|     ALU_SRO: result_o = shift_result; |
| |
|     // Comparison Operations: single result bit, zero-extended |
|     ALU_EQ, ALU_NE, |
|     ALU_LT, ALU_LTU, |
|     ALU_GE, ALU_GEU, |
|     ALU_SLT, ALU_SLTU: result_o = {31'h0, cmp_result}; |
| |
|     // MinMax Operations (RV32B) |
|     ALU_MIN, ALU_MINU, |
|     ALU_MAX, ALU_MAXU: result_o = minmax_result; |
| |
|     // Bitcount Operations (RV32B): zero-extended count |
|     ALU_CLZ, |
|     ALU_CTZ, |
|     ALU_CPOP: result_o = {26'h0, bitcnt_result}; |
| |
|     // Sign-Extend (RV32B) |
|     ALU_SEXTB, ALU_SEXTH: result_o = sext_result; |
| |
|     // Pack Operations (RV32B) |
|     ALU_PACK, |
|     ALU_PACKU, |
|     ALU_PACKH: result_o = pack_result; |
| |
|     // Single-Bit Bitmanip Operations (RV32B) |
|     ALU_BSET, ALU_BCLR, |
|     ALU_BINV, ALU_BEXT: result_o = singlebit_result; |
| |
|     // General Reverse / Or-combine (RV32B) |
|     ALU_GREV, ALU_GORC: result_o = rev_result; |
| |
|     // Shuffle Operations (RV32B) |
|     ALU_SHFL, ALU_UNSHFL: result_o = shuffle_result; |
| |
|     // Crossbar Permutation Operations (RV32B) |
|     ALU_XPERM_N, |
|     ALU_XPERM_B, |
|     ALU_XPERM_H: result_o = xperm_result; |
| |
|     // Bit Field Place (RV32B) |
|     ALU_BFP: result_o = bfp_result; |
| |
|     // Carry-less Multiply Operations (RV32B) |
|     ALU_CLMUL, |
|     ALU_CLMULR, |
|     ALU_CLMULH: result_o = clmul_result; |
| |
|     // Multicycle operations (RV32B): |
|     // ternary bitmanip, funnel shifts and rotates |
|     ALU_CMIX, ALU_CMOV, |
|     ALU_FSL, ALU_FSR, |
|     ALU_ROL, ALU_ROR, |
|     // cyclic redundancy checks |
|     ALU_CRC32_B, ALU_CRC32C_B, |
|     ALU_CRC32_H, ALU_CRC32C_H, |
|     ALU_CRC32_W, ALU_CRC32C_W, |
|     // bit compress / decompress |
|     ALU_BCOMPRESS, ALU_BDECOMPRESS: result_o = multicycle_result; |
| |
|     default: result_o = '0; |
|   endcase |
| end |
| |
| // Bit 5 of shift_amt_compl is not read by the logic above; consume it here. |
| logic unused_shift_amt_compl; |
| assign unused_shift_amt_compl = shift_amt_compl[5]; |
| |
| endmodule |