| // description |
| // 1. the pmtrdt_unit module is responsible for one PMTRDT instruction. |
| // |
| // feature list: |
| // 1. Compare/Reduction/Compress instruction is optional based on parameters. |
| // 2. the latency of all instructions is 2-cycles. |
| |
| `ifndef HDL_VERILOG_RVV_DESIGN_RVV_SVH |
| `include "rvv_backend.svh" |
| `endif |
| `ifndef RVV_ASSERT__SVH |
| `include "rvv_backend_sva.svh" |
| `endif |
| `ifndef PMTRDT_DEFINE_SVH |
| `include "rvv_backend_pmtrdt.svh" |
| `endif |
| |
| module rvv_backend_pmtrdt_unit |
| ( |
| clk, |
| rst_n, |
| |
| pmtrdt_uop_valid, |
| pmtrdt_uop, |
| pmtrdt_uop_ready, |
| |
| pmtrdt_res_valid, |
| pmtrdt_res, |
| pmtrdt_res_ready, |
| |
| uop_data, |
| uop_cnt, |
| trap_flush_rvv |
| ); |
| // ---parameter definition-------------------------------------------- |
| parameter GEN_RDT = 1'b0; // by default, NO Reduction unit |
| parameter GEN_CMP = 1'b0; // by default, NO COMPARE unit |
| parameter GEN_PMT = 1'b0; // by default, NO PERMUTATION unit |
| |
| localparam VLENB_WIDTH = $clog2(`VLENB); |
| // ---port definition------------------------------------------------- |
| // global signal |
| input logic clk; |
| input logic rst_n; |
| |
| // the uop from PMTRDT RS |
| input pmtrdt_uop_valid; |
| input PMT_RDT_RS_t pmtrdt_uop; |
| output logic pmtrdt_uop_ready; |
| |
| // the result to PMTRDT PU |
| output logic pmtrdt_res_valid; |
| output PU2ROB_t pmtrdt_res; |
| input pmtrdt_res_ready; |
| |
| // all uops from the PMTRDT RS, used for permutation |
| input PMT_RDT_RS_t [`PMTRDT_RS_DEPTH-1:0] uop_data; |
| input [$clog2(`PMTRDT_RS_DEPTH):0] uop_cnt; |
| |
| // trap-flush |
| input trap_flush_rvv; |
| |
| // ---internal signal definition-------------------------------------- |
| // control signal |
| PMTRDT_UOP_TYPE_t uop_type, uop_type_q; |
| |
| RDT_CTRL_t rdt_ctrl, rdt_ctrl_q; // RDT+CMP control signals |
| logic rdt_ctrl_reg_en, rdt_ctrl_reg_clr; |
| |
| PMT_CTRL_t pmt_ctrl, pmt_ctrl_q; // PMT control signals |
| logic pmt_ctrl_reg_en, pmt_ctrl_reg_clr; |
| |
| COMPRESS_CTRL_t compress_ctrl_ex0, compress_ctrl_ex1; |
| logic compress_ctrl_push, compress_ctrl_pop; |
| logic compress_ctrl_empty; |
| |
| // Reduction operation |
| logic red_widen_sum_flag; |
| logic [`VLEN-1:0] widen_vs2; // vs2 data after widening (if needed) |
| BYTE_TYPE_t widen_vs2_type; // vs2 per-byte type after widening (if needed) |
| logic [`VLENB/2-1:0][7:0] logic_src1_1stage, logic_src2_1stage; // and/or/xor operation: source value for reduction vs2[*] |
| logic [`VLENB/2-1:0][8:0] sum_src1_1stage, sum_src2_1stage; // sum operation: source value for reduction vs2[*] |
| logic [`VLENB/2-1:0] sum_cin_1stage; |
| logic [`VLENB/2-1:0][8:0] max_src1_1stage, max_src2_1stage; // max operation: source value for reduction vs2[*] |
| logic [`VLENB/2-1:0] max_cin_1stage; |
| logic [`VLENB/2-1:0][8:0] min_src1_1stage, min_src2_1stage; // min operation: source value for reduction vs2[*] |
| logic [`VLENB/2-1:0] min_cin_1stage; |
| logic [`VLENB/2-1:0][8:0] sum_res_1stage, max_res_1stage, min_res_1stage; |
| logic [`VLENB/2-1:0][7:0] and_1stage, or_1stage, xor_1stage; |
| logic [`VLENB/2-1:0] less_than_1stage, great_than_1stage; |
| logic [`VLENB/4-1:0][8:0] sum_src1_2stage, sum_src2_2stage; // sum operation: source value for reduction sum_res_1stage[*] |
| logic [`VLENB/4-1:0] sum_cin_2stage; |
| logic [`VLENB/4-1:0][8:0] max_src1_2stage, max_src2_2stage; // max/min operation: source value for reduction max_res_1stage[*] |
| logic [`VLENB/4-1:0] max_cin_2stage; |
| logic [`VLENB/4-1:0][8:0] min_src1_2stage, min_src2_2stage; // max/min operation: source value for reduction min_res_1stage[*] |
| logic [`VLENB/4-1:0] min_cin_2stage; |
| logic [`VLENB/4-1:0][8:0] sum_res_2stage, max_res_2stage, min_res_2stage; |
| logic [`VLENB/4-1:0][7:0] and_2stage, or_2stage, xor_2stage; |
| logic [`VLENB/4-1:0] less_than_2stage, great_than_2stage; |
| logic sel_vs1; // operate vs1[0] if the last operation for reduction instruction |
| logic [3:0][8:0] src1_vd_1stage, src2_vs1_1stage; // source value for reduction vs1[0] & vd[0] |
| logic [3:0] carry_in_vd_1stage; |
| logic [3:0][8:0] sum_vd_1stage; |
| logic [3:0][7:0] and_vd_1stage, or_vd_1stage, xor_vd_1stage; |
| logic [3:0] less_than_vd_1stage, great_than_vd_1stage; |
| logic [3:0][8:0] src1_vd_2stage, src2_vs1_2stage; // source value for reduction vs1[0] & vd[0] |
| logic [3:0] carry_in_vd_2stage; |
| logic [3:0][8:0] sum_vd_2stage; |
| logic [3:0][7:0] and_vd_2stage, or_vd_2stage, xor_vd_2stage; |
| logic [3:0] less_than_vd_2stage, great_than_vd_2stage; |
| logic [`VLENB/4-1:0][7:0] max_res_ex0, min_res_ex0; |
| logic [`VLENB/4-1:0][7:0] sum_res_ex1, max_res_ex1, min_res_ex1, and_res_ex1, or_res_ex1, xor_res_ex1; |
| logic [3:0][7:0] max_vs1_ex0, min_vs1_ex0; |
| logic [3:0][7:0] sum_vs1_ex1, max_vs1_ex1, min_vs1_ex1, and_vs1_ex1, or_vs1_ex1, xor_vs1_ex1; |
| logic red_res_en; |
| logic [7:0] sum_8b, max_8b, min_8b, and_8b, or_8b, xor_8b; |
| logic [15:0] sum_16b, max_16b, min_16b, and_16b, or_16b, xor_16b; |
| logic [31:0] sum_32b, max_32b, min_32b, and_32b, or_32b, xor_32b; |
| logic [1:0][15:0] max_16b_1stage, min_16b_1stage; |
| logic [3:0][7:0] max_8b_1stage, min_8b_1stage; |
| logic [`VLEN-1:0] pmtrdt_res_red; // pmtrdt result of reduction |
| // Comparison operation |
| logic [`VSTART_WIDTH-1:0] cmp_vstart_d, cmp_vstart_q; |
| logic cmp_vstart_en; |
| logic [`VLENB-1:0][8:0] cmp_src1, cmp_src2; // source value for reduction/compare |
| logic [`VLENB-1:0] in_data, cin_data, bin_data; // vmadc/vmsbc mask data |
| logic [`VLENB-1:0] cmp_carry_in; |
| logic [`VLENB-1:0][8:0] cmp_sum; |
| logic [`VLENB-1:0] less_than, great_than_equal, equal, not_equal, out_data; |
| logic [`VLENB-1:0] cmp_res; |
| logic [`VSTART_WIDTH-1:0] cmp_res_offset, cmp_res_en_offset; |
| logic [`VLEN-1:0] cmp_res_d, cmp_res_q; |
| logic [2*`VLENB-1:0] cmp_res_en; |
| logic [`VLEN-1:0] pmtrdt_res_cmp; // pmtrdt result of compare |
| // Permutation operation |
| // slide+gather instruction |
| logic [`PMTRDT_RS_DEPTH-1:0] rs_entry_valid; |
| logic pmt_go, pmt_go_q; // start to execute pmt inst when all uop(s) are in RS |
| logic [`UOP_INDEX_WIDTH-1:0] pmt_uop_done_cnt_d, pmt_uop_done_cnt_q; |
| logic [`VLENB-1:0][`XLEN+1:0] offset; |
| logic [`VLENB-1:0] sel_scalar; |
| BYTE_TYPE_t vd_type; |
| logic [`VLMAX_MAX-1:0][7:0] pmt_vs2_data, pmt_vs3_data; |
| logic [`XLEN-1:0] pmt_rs1_data; |
| logic [`VLENB-1:0][7:0] pmt_res_d, pmt_res_q; |
| logic pmt_res_en; |
| logic [`VLEN-1:0] pmtrdt_res_pmt; // pmtrdt result of permutation |
| // compress instruction |
| logic [`VLENB-1:0] compress_enable, compress_body; |
| logic [`VLENB-1:0][VLENB_WIDTH:0] compress_offset; |
| logic [`VLEN-1:0] compress_mask_d, compress_mask_q; // register vs1_data for compress mask |
| logic compress_mask_en; |
| logic [VLENB_WIDTH:0] compress_cnt_d, compress_cnt_q, compress_cnt_qq; // compress counter |
| logic [1:0][VLENB_WIDTH:0] valid_num; |
| logic compress_cnt_ge_vlenb, compress_cnt_gt_vlenb; |
| logic compress_cnt_en, compress_cnt_clr; |
| logic [`VLENB-1:0][7:0] compress_value; |
| logic [2*`VLENB-1:0][7:0] compress_res_d, compress_res_q; |
| logic [2*`VLENB-1:0] compress_res_en; |
| logic [`VLEN-1:0] pmtrdt_res_compress; // pmtrdt result of vcompress instruction |
| |
| genvar i; |
| // ---code start------------------------------------------------------ |
| // control signals based on uop |
| // uop_type: classify the incoming uop by its execution unit. |
| //   PMT -> PERMUTATION, RDT -> REDUCTION, anything else -> COMPARE. |
| always_comb begin |
|   uop_type = COMPARE; |
|   case (pmtrdt_uop.uop_exe_unit) |
|     PMT:     uop_type = PERMUTATION; |
|     RDT:     uop_type = REDUCTION; |
|     default: ; // keep COMPARE |
|   endcase |
| end |
| // Register holding uop_type (output uop_type_q). |
| //   .e : a uop is accepted this cycle (valid & ready). |
| //   .c : trap_flush_rvv, or no uop is being accepted while rdt_ctrl_q marks a |
| //        last uop; for a compress uop compress_ctrl_ex1.last_uop_valid must be |
| //        set as well. |
| logic uop_type_reg_en; |
| logic uop_type_reg_clr; |
| assign uop_type_reg_en  = pmtrdt_uop_ready & pmtrdt_uop_valid; |
| assign uop_type_reg_clr = rdt_ctrl_q.compress |
|                         ? (rdt_ctrl_q.last_uop_valid & compress_ctrl_ex1.last_uop_valid & !rdt_ctrl_reg_en) |
|                         : (rdt_ctrl_q.last_uop_valid & !rdt_ctrl_reg_en); |
| cdffr #(.T(PMTRDT_UOP_TYPE_t)) uop_type_reg ( |
|   .clk   (clk), |
|   .rst_n (rst_n), |
|   .c     (uop_type_reg_clr | trap_flush_rvv), |
|   .e     (uop_type_reg_en), |
|   .d     (uop_type), |
|   .q     (uop_type_q) |
| ); |
| |
| // rdt control signals |
| // sign_opr: 0-unsigned, 1-signed. |
| // Signed is the default; only the funct6 values listed below select unsigned. |
| always_comb begin |
|   rdt_ctrl.sign_opr = 1'b1; |
|   case (pmtrdt_uop.uop_funct6) |
|     VMADC, VMSBC, |
|     VMSLTU, VMSLEU, VMSGTU, |
|     VREDMAXU, VREDMINU, |
|     VWREDSUMU: rdt_ctrl.sign_opr = 1'b0; |
|     default:   ; // keep signed |
|   endcase |
| end |
| |
| // cmp_opr: compare operation selected by funct6 |
| // (equal / not equal / less than / less than or equal / great than / |
| //  carry_out / borrow_out). Any funct6 not listed yields NOT_EQUAL. |
| always_comb begin |
|   rdt_ctrl.cmp_opr = NOT_EQUAL; |
|   case (pmtrdt_uop.uop_funct6) |
|     VMSEQ:         rdt_ctrl.cmp_opr = EQUAL; |
|     VMSNE:         rdt_ctrl.cmp_opr = NOT_EQUAL; |
|     VMSLTU, VMSLT: rdt_ctrl.cmp_opr = LESS_THAN; |
|     VMSLEU, VMSLE: rdt_ctrl.cmp_opr = LESS_THAN_OR_EQUAL; |
|     VMSGTU, VMSGT: rdt_ctrl.cmp_opr = GREAT_THAN; |
|     VMADC:         rdt_ctrl.cmp_opr = COUT; |
|     VMSBC:         rdt_ctrl.cmp_opr = BOUT; |
|     default:       ; // keep NOT_EQUAL |
|   endcase |
| end |
| |
| // widen: vd EEW = 2*SEW; set for the two widening-sum reductions only |
| assign rdt_ctrl.widen = (pmtrdt_uop.uop_funct6 == VWREDSUM) || |
|                         (pmtrdt_uop.uop_funct6 == VWREDSUMU); |
| |
| // rdt_opr: reduction operation selected by funct6. |
| // Any funct6 not listed yields SUM. |
| always_comb begin |
|   rdt_ctrl.rdt_opr = SUM; |
|   case (pmtrdt_uop.uop_funct6) |
|     VREDSUM, VWREDSUMU, VWREDSUM: rdt_ctrl.rdt_opr = SUM; |
|     VREDMAXU, VREDMAX:            rdt_ctrl.rdt_opr = MAX; |
|     VREDMINU, VREDMIN:            rdt_ctrl.rdt_opr = MIN; |
|     VREDAND:                      rdt_ctrl.rdt_opr = AND; |
|     VREDOR:                       rdt_ctrl.rdt_opr = OR; |
|     VREDXOR:                      rdt_ctrl.rdt_opr = XOR; |
|     default:                      ; // keep SUM |
|   endcase |
| end |
| |
| // compress: a PMT-unit uop whose funct6 is VCOMPRESS |
| assign rdt_ctrl.compress = (pmtrdt_uop.uop_funct6 == VCOMPRESS) && (pmtrdt_uop.uop_exe_unit == PMT); |
| |
| // uop information copied unchanged from the incoming uop |
| assign rdt_ctrl.last_uop_valid = pmtrdt_uop.last_uop_valid; |
| assign rdt_ctrl.rob_entry      = pmtrdt_uop.rob_entry; |
| assign rdt_ctrl.vs1_eew        = pmtrdt_uop.vs1_eew; |
| assign rdt_ctrl.vl             = pmtrdt_uop.vl; |
| assign rdt_ctrl.vm             = pmtrdt_uop.vm; |
| assign rdt_ctrl.v0_data        = pmtrdt_uop.v0_data; |
| assign rdt_ctrl.vs3_data       = pmtrdt_uop.vs3_data; |
| `ifdef TB_SUPPORT |
| assign rdt_ctrl.uop_pc         = pmtrdt_uop.uop_pc; // present only when TB_SUPPORT is defined |
| `endif |
| |
| // cmp_evl |
| //   prestart element: undisturbed |
| //   body element: |
| //     active element: updated |
| //     inactive element: undisturbed |
| //   tail element: |
| //     tail element in CMP-unit: updated |
| //     tail element not in CMP-unit: disturbed |
| // Computed as uop_index * N + N, where N is `VLENB/4 for EEW32, |
| // `VLENB/2 for EEW16 and `VLENB for any other vs2_eew. |
| always_comb begin |
|   rdt_ctrl.cmp_evl = pmtrdt_uop.uop_index * `VLENB + `VLENB; |
|   case (pmtrdt_uop.vs2_eew) |
|     EEW32:   rdt_ctrl.cmp_evl = pmtrdt_uop.uop_index * (`VLENB/4) + (`VLENB/4); |
|     EEW16:   rdt_ctrl.cmp_evl = pmtrdt_uop.uop_index * (`VLENB/2) + (`VLENB/2); |
|     default: ; // keep the `VLENB-per-uop value |
|   endcase |
| end |
| |
| // rdt_ctrl register (output rdt_ctrl_q). |
| //   .e : a uop is accepted this cycle (valid & ready). |
| //   .c : trap_flush_rvv, or the ex0 stage has no uop to execute: nothing is |
| //        being accepted while rdt_ctrl_q marks a last uop. For a compress uop |
| //        compress_ctrl_ex1.last_uop_valid must be set as well. |
| assign rdt_ctrl_reg_en  = pmtrdt_uop_ready & pmtrdt_uop_valid; |
| assign rdt_ctrl_reg_clr = rdt_ctrl_q.compress |
|                         ? (rdt_ctrl_q.last_uop_valid & compress_ctrl_ex1.last_uop_valid & !rdt_ctrl_reg_en) |
|                         : (rdt_ctrl_q.last_uop_valid & !rdt_ctrl_reg_en); |
| cdffr #(.T(RDT_CTRL_t)) rdt_ctrl_reg ( |
|   .clk   (clk), |
|   .rst_n (rst_n), |
|   .c     (rdt_ctrl_reg_clr | trap_flush_rvv), |
|   .e     (rdt_ctrl_reg_en), |
|   .d     (rdt_ctrl), |
|   .q     (rdt_ctrl_q) |
| ); |
| |
| // pmt_opr: permutation operation selected by funct6 (and funct3). |
| // Any funct6 not listed yields GATHER. |
| always_comb begin |
|   pmt_ctrl.pmt_opr = GATHER; |
|   case (pmtrdt_uop.uop_funct6) |
|     // VSLIDE1UP == VSLIDEUP_RGATHEREI16: funct3 OPIVV selects the gather form |
|     VSLIDE1UP:   pmt_ctrl.pmt_opr = (pmtrdt_uop.uop_funct3 == OPIVV) ? GATHER : SLIDE_UP; |
|     // VSLIDEDOWN == VSLIDE1DOWN |
|     VSLIDE1DOWN: pmt_ctrl.pmt_opr = SLIDE_DOWN; |
|     VRGATHER:    pmt_ctrl.pmt_opr = GATHER; |
|     default:     ; // keep GATHER |
|   endcase |
| end |
| |
| // uop information for the permutation path, read from the RS entry |
| // indexed by pmt_uop_done_cnt_q |
| assign pmt_ctrl.vs3_data  = uop_data[pmt_uop_done_cnt_q].vs3_data; |
| assign pmt_ctrl.rob_entry = uop_data[pmt_uop_done_cnt_q].rob_entry; |
| `ifdef TB_SUPPORT |
| assign pmt_ctrl.uop_pc    = uop_data[pmt_uop_done_cnt_q].uop_pc; // present only when TB_SUPPORT is defined |
| `endif |
| |
| // pmt_ctrl register (output pmt_ctrl_q): .e follows pmt_go; .c is asserted |
| // whenever pmt_go is low, or on trap_flush_rvv. |
| assign pmt_ctrl_reg_en  = pmt_go; |
| assign pmt_ctrl_reg_clr = !pmt_ctrl_reg_en; |
| cdffr #(.T(PMT_CTRL_t)) pmt_ctrl_reg ( |
|   .clk   (clk), |
|   .rst_n (rst_n), |
|   .c     (pmt_ctrl_reg_clr | trap_flush_rvv), |
|   .e     (pmt_ctrl_reg_en), |
|   .d     (pmt_ctrl), |
|   .q     (pmt_ctrl_q) |
| ); |
| |
| // Reduction unit |
| generate |
| if (GEN_RDT == 1'b1) begin |
| // logic_src1_1stage/logic_src2_1stage data for bit manipulation: and/or/xor. |
| // Each group i covers 8 vs2 bytes: bytes 8*i+0..3 feed logic_src2_1stage and |
| // bytes 8*i+4..7 feed logic_src1_1stage. A byte whose type is not BODY_ACTIVE |
| // is replaced by 8'hFF when rdt_opr is AND and by 8'h00 otherwise. |
| // The lane index k is a genvar so that every always_comb writes a |
| // constant-indexed element. |
| for (i=0; i<`VLENB/(2*4); i++) begin : gen_rdt_logic_src_bit_data |
|   for (genvar k=0; k<4; k++) begin : gen_lane |
|     always_comb begin |
|       case (rdt_ctrl.rdt_opr) |
|         AND: begin |
|           logic_src2_1stage[4*i+k] = (pmtrdt_uop.vs2_type[8*i+k]   == BODY_ACTIVE) ? pmtrdt_uop.vs2_data[8*(8*i+k)+:8]   : 8'hFF; |
|           logic_src1_1stage[4*i+k] = (pmtrdt_uop.vs2_type[8*i+4+k] == BODY_ACTIVE) ? pmtrdt_uop.vs2_data[8*(8*i+4+k)+:8] : 8'hFF; |
|         end |
|         default: begin |
|           logic_src2_1stage[4*i+k] = (pmtrdt_uop.vs2_type[8*i+k]   == BODY_ACTIVE) ? pmtrdt_uop.vs2_data[8*(8*i+k)+:8]   : 8'h00; |
|           logic_src1_1stage[4*i+k] = (pmtrdt_uop.vs2_type[8*i+4+k] == BODY_ACTIVE) ? pmtrdt_uop.vs2_data[8*(8*i+4+k)+:8] : 8'h00; |
|         end |
|       endcase |
|     end |
|   end |
| end // gen_rdt_logic_src_bit_data |
| |
| // widen vs2 data & widen vs2 type. |
| // When rdt_ctrl.widen is clear, vs2 data/type pass through unchanged. |
| // When it is set, half of vs2 is expanded to twice the element width: |
| // red_widen_sum_flag selects the high half (1) or the low half (0). The added |
| // upper part of each element is the replicated MSB when sign_opr is set and |
| // zero otherwise; the type entries of the source element are duplicated. |
| always_comb begin |
|   if (rdt_ctrl.widen) begin |
|     case (pmtrdt_uop.vs2_eew) |
|       EEW16: begin |
|         for (int j=0; j<`VLENB/4; j++) begin |
|           if (red_widen_sum_flag) begin // high part of vs2_data |
|             widen_vs2[16*(2*j)+:16]   = pmtrdt_uop.vs2_data[(`VLEN/2+16*j)+:16]; |
|             widen_vs2[16*(2*j+1)+:16] = rdt_ctrl.sign_opr ? {16{pmtrdt_uop.vs2_data[`VLEN/2+16*(j+1)-1]}} |
|                                                           : '0; |
|             widen_vs2_type[4*j]   = pmtrdt_uop.vs2_type[`VLENB/2+2*j]; |
|             widen_vs2_type[4*j+1] = pmtrdt_uop.vs2_type[`VLENB/2+2*j+1]; |
|             widen_vs2_type[4*j+2] = pmtrdt_uop.vs2_type[`VLENB/2+2*j]; |
|             widen_vs2_type[4*j+3] = pmtrdt_uop.vs2_type[`VLENB/2+2*j+1]; |
|           end else begin // low part of vs2_data |
|             widen_vs2[16*(2*j)+:16]   = pmtrdt_uop.vs2_data[(16*j)+:16]; |
|             widen_vs2[16*(2*j+1)+:16] = rdt_ctrl.sign_opr ? {16{pmtrdt_uop.vs2_data[16*(j+1)-1]}} |
|                                                           : '0; |
|             widen_vs2_type[4*j]   = pmtrdt_uop.vs2_type[2*j]; |
|             widen_vs2_type[4*j+1] = pmtrdt_uop.vs2_type[2*j+1]; |
|             widen_vs2_type[4*j+2] = pmtrdt_uop.vs2_type[2*j]; |
|             widen_vs2_type[4*j+3] = pmtrdt_uop.vs2_type[2*j+1]; |
|           end |
|         end |
|       end |
|       default: begin |
|         for (int j=0; j<`VLENB/2; j++) begin |
|           if (red_widen_sum_flag) begin // high part of vs2_data |
|             widen_vs2[8*(2*j)+:8]   = pmtrdt_uop.vs2_data[(`VLEN/2+8*j)+:8]; |
|             widen_vs2[8*(2*j+1)+:8] = rdt_ctrl.sign_opr ? {8{pmtrdt_uop.vs2_data[`VLEN/2+8*(j+1)-1]}} |
|                                                         : '0; |
|             widen_vs2_type[2*j]   = pmtrdt_uop.vs2_type[`VLENB/2+j]; |
|             widen_vs2_type[2*j+1] = pmtrdt_uop.vs2_type[`VLENB/2+j]; |
|           end else begin // low part of vs2_data |
|             widen_vs2[8*(2*j)+:8]   = pmtrdt_uop.vs2_data[(8*j)+:8]; |
|             widen_vs2[8*(2*j+1)+:8] = rdt_ctrl.sign_opr ? {8{pmtrdt_uop.vs2_data[8*(j+1)-1]}} |
|                                                         : '0; |
|             widen_vs2_type[2*j]   = pmtrdt_uop.vs2_type[j]; |
|             widen_vs2_type[2*j+1] = pmtrdt_uop.vs2_type[j]; |
|           end |
|         end |
|       end |
|     endcase |
|   end else begin |
|     widen_vs2      = pmtrdt_uop.vs2_data; |
|     widen_vs2_type = pmtrdt_uop.vs2_type; |
|   end |
| end |
| // sum_src1_1stage/sum_src2_1stage/sum_cin_1stage data. |
| // Each group i covers 8 bytes of widen_vs2: bytes 8*i+0..3 feed sum_src2 and |
| // bytes 8*i+4..7 feed sum_src1; a byte that is not BODY_ACTIVE contributes 0. |
| // A lane is "top" when it is lane 3 (EEW32), an odd lane (EEW16) or any lane |
| // (other vs1_eew). Bit 8 of a top lane is bit 7 when sign_opr is set, else 0; |
| // bit 8 of every other lane is 0. Non-first lanes take bit 8 of the previous |
| // lane's sum as carry-in, except where a new EEW16 pair starts (lane 2) or |
| // for other vs1_eew, where carry-in is 0. |
| for (i=0; i<`VLENB/(2*4); i++) begin : gen_rdt_sum_src_1stage_data |
|   for (genvar k=0; k<4; k++) begin : gen_lane |
|     localparam int LANE      = 4*i + k;     // stage-1 lane index |
|     localparam int SRC2_BYTE = 8*i + k;     // widen_vs2 byte feeding sum_src2 |
|     localparam int SRC1_BYTE = 8*i + 4 + k; // widen_vs2 byte feeding sum_src1 |
|     localparam bit TOP_OF_32 = (k == 3); |
|     localparam bit TOP_OF_16 = ((k % 2) == 1); |
|     logic lane_is_top; |
| |
|     // Reduction instruction: widen_vs2_eew == vs1_eew |
|     always_comb begin |
|       case (pmtrdt_uop.vs1_eew) |
|         EEW32:   lane_is_top = TOP_OF_32; |
|         EEW16:   lane_is_top = TOP_OF_16; |
|         default: lane_is_top = 1'b1; |
|       endcase |
|     end |
| |
|     // sum_src2_1stage data |
|     always_comb begin |
|       sum_src2_1stage[LANE][7:0] = (widen_vs2_type[SRC2_BYTE] == BODY_ACTIVE) ? widen_vs2[8*SRC2_BYTE+:8] : 8'h00; |
|       sum_src2_1stage[LANE][8]   = lane_is_top ? (rdt_ctrl.sign_opr ? sum_src2_1stage[LANE][7] : 1'b0) : 1'b0; |
|     end |
| |
|     // sum_src1_1stage data |
|     always_comb begin |
|       sum_src1_1stage[LANE][7:0] = (widen_vs2_type[SRC1_BYTE] == BODY_ACTIVE) ? widen_vs2[8*SRC1_BYTE+:8] : 8'h00; |
|       sum_src1_1stage[LANE][8]   = lane_is_top ? (rdt_ctrl.sign_opr ? sum_src1_1stage[LANE][7] : 1'b0) : 1'b0; |
|     end |
| |
|     // sum_cin_1stage data |
|     if (k == 0) begin : gen_cin_first |
|       always_comb sum_cin_1stage[LANE] = 1'b0; |
|     end else begin : gen_cin_chain |
|       always_comb begin |
|         case (pmtrdt_uop.vs1_eew) |
|           EEW32:   sum_cin_1stage[LANE] = sum_res_1stage[LANE-1][8]; |
|           EEW16:   sum_cin_1stage[LANE] = TOP_OF_16 ? sum_res_1stage[LANE-1][8] : 1'b0; |
|           default: sum_cin_1stage[LANE] = 1'b0; |
|         endcase |
|       end |
|     end |
|   end |
| end // gen_rdt_sum_src_1stage_data |
| |
| // max_src1_1stage/max_src2_1stage/max_cin_1stage data. |
| // Each group i covers 8 vs2 bytes: bytes 8*i+0..3 feed max_src2 as-is and |
| // bytes 8*i+4..7 feed max_src1 bit-inverted, so the stage-1 adder forms |
| // src2 + ~src1 + cin. |
| // A lane is "top" when it is lane 3 (EEW32), an odd lane (EEW16) or any lane |
| // (other vs1_eew). A byte that is not BODY_ACTIVE is replaced by 8'h00, or by |
| // 8'h80 on a top lane when sign_opr is set (inverted for max_src1). |
| // Bit 8: 0 on non-top lanes; on top lanes it copies bit 7 when sign_opr is |
| // set, otherwise 0 for max_src2 and 1 for max_src1. |
| // Carry-in is 1 where a chain starts and bit 8 of the previous lane's result |
| // where it continues. |
| for (i=0; i<`VLENB/(2*4); i++) begin : gen_rdt_max_src_1stage_data |
|   for (genvar k=0; k<4; k++) begin : gen_lane |
|     localparam int LANE      = 4*i + k;     // stage-1 lane index |
|     localparam int SRC2_BYTE = 8*i + k;     // vs2 byte feeding max_src2 |
|     localparam int SRC1_BYTE = 8*i + 4 + k; // vs2 byte feeding max_src1 |
|     localparam bit TOP_OF_32 = (k == 3); |
|     localparam bit TOP_OF_16 = ((k % 2) == 1); |
|     logic lane_is_top; |
| |
|     // Reduction instruction: widen_vs2_eew == vs1_eew |
|     always_comb begin |
|       case (pmtrdt_uop.vs1_eew) |
|         EEW32:   lane_is_top = TOP_OF_32; |
|         EEW16:   lane_is_top = TOP_OF_16; |
|         default: lane_is_top = 1'b1; |
|       endcase |
|     end |
| |
|     // max_src2_1stage data |
|     always_comb begin |
|       max_src2_1stage[LANE][7:0] = (pmtrdt_uop.vs2_type[SRC2_BYTE] == BODY_ACTIVE) |
|                                  ? pmtrdt_uop.vs2_data[8*SRC2_BYTE+:8] |
|                                  : (lane_is_top ? (rdt_ctrl.sign_opr ? 8'h80 : 8'h00) : 8'h00); |
|       max_src2_1stage[LANE][8]   = lane_is_top ? (rdt_ctrl.sign_opr ? max_src2_1stage[LANE][7] : 1'b0) : 1'b0; |
|     end |
| |
|     // max_src1_1stage data |
|     always_comb begin |
|       max_src1_1stage[LANE][7:0] = (pmtrdt_uop.vs2_type[SRC1_BYTE] == BODY_ACTIVE) |
|                                  ? ~pmtrdt_uop.vs2_data[8*SRC1_BYTE+:8] |
|                                  : (lane_is_top ? (rdt_ctrl.sign_opr ? ~8'h80 : ~8'h00) : ~8'h00); |
|       max_src1_1stage[LANE][8]   = lane_is_top ? (rdt_ctrl.sign_opr ? max_src1_1stage[LANE][7] : ~1'b0) : 1'b0; |
|     end |
| |
|     // max_cin_1stage data |
|     if (k == 0) begin : gen_cin_first |
|       always_comb max_cin_1stage[LANE] = 1'b1; |
|     end else begin : gen_cin_chain |
|       always_comb begin |
|         case (pmtrdt_uop.vs1_eew) |
|           EEW32:   max_cin_1stage[LANE] = max_res_1stage[LANE-1][8]; |
|           EEW16:   max_cin_1stage[LANE] = TOP_OF_16 ? max_res_1stage[LANE-1][8] : 1'b1; |
|           default: max_cin_1stage[LANE] = 1'b1; |
|         endcase |
|       end |
|     end |
|   end |
| end // gen_rdt_max_src_1stage_data |
| |
| // min_src1_1stage/min_src2_1stage data |
| for (i=0; i<`VLENB/(2*4); i++) begin : gen_rdt_min_src_1stage_data |
| // min_src2_1stage data |
| always_comb begin |
| case (pmtrdt_uop.vs1_eew) |
| EEW32: begin |
| min_src2_1stage[4*i][7:0] = pmtrdt_uop.vs2_type[8*i] == BODY_ACTIVE ? pmtrdt_uop.vs2_data[8*(8*i)+:8] : 8'hFF; |
| min_src2_1stage[4*i+1][7:0] = pmtrdt_uop.vs2_type[8*i+1] == BODY_ACTIVE ? pmtrdt_uop.vs2_data[8*(8*i+1)+:8] : 8'hFF; |
| min_src2_1stage[4*i+2][7:0] = pmtrdt_uop.vs2_type[8*i+2] == BODY_ACTIVE ? pmtrdt_uop.vs2_data[8*(8*i+2)+:8] : 8'hFF; |
| min_src2_1stage[4*i+3][7:0] = pmtrdt_uop.vs2_type[8*i+3] == BODY_ACTIVE ? pmtrdt_uop.vs2_data[8*(8*i+3)+:8] : (rdt_ctrl.sign_opr ? 8'h7F : 8'hFF); |
| end |
| EEW16: begin |
| min_src2_1stage[4*i][7:0] = pmtrdt_uop.vs2_type[8*i] == BODY_ACTIVE ? pmtrdt_uop.vs2_data[8*(8*i)+:8] : 8'hFF; |
| min_src2_1stage[4*i+1][7:0] = pmtrdt_uop.vs2_type[8*i+1] == BODY_ACTIVE ? pmtrdt_uop.vs2_data[8*(8*i+1)+:8] : (rdt_ctrl.sign_opr ? 8'h7F : 8'hFF); |
| min_src2_1stage[4*i+2][7:0] = pmtrdt_uop.vs2_type[8*i+2] == BODY_ACTIVE ? pmtrdt_uop.vs2_data[8*(8*i+2)+:8] : 8'hFF; |
| min_src2_1stage[4*i+3][7:0] = pmtrdt_uop.vs2_type[8*i+3] == BODY_ACTIVE ? pmtrdt_uop.vs2_data[8*(8*i+3)+:8] : (rdt_ctrl.sign_opr ? 8'h7F : 8'hFF); |
| end |
| default: begin |
| min_src2_1stage[4*i][7:0] = pmtrdt_uop.vs2_type[8*i] == BODY_ACTIVE ? pmtrdt_uop.vs2_data[8*(8*i)+:8] : (rdt_ctrl.sign_opr ? 8'h7F : 8'hFF); |
| min_src2_1stage[4*i+1][7:0] = pmtrdt_uop.vs2_type[8*i+1] == BODY_ACTIVE ? pmtrdt_uop.vs2_data[8*(8*i+1)+:8] : (rdt_ctrl.sign_opr ? 8'h7F : 8'hFF); |
| min_src2_1stage[4*i+2][7:0] = pmtrdt_uop.vs2_type[8*i+2] == BODY_ACTIVE ? pmtrdt_uop.vs2_data[8*(8*i+2)+:8] : (rdt_ctrl.sign_opr ? 8'h7F : 8'hFF); |
| min_src2_1stage[4*i+3][7:0] = pmtrdt_uop.vs2_type[8*i+3] == BODY_ACTIVE ? pmtrdt_uop.vs2_data[8*(8*i+3)+:8] : (rdt_ctrl.sign_opr ? 8'h7F : 8'hFF); |
| end |
| endcase |
| case (pmtrdt_uop.vs1_eew) // Reduction instruction: widen_vs2_eew == vs1_eew |
| EEW32:begin |
| min_src2_1stage[4*i][8] = 1'b0; |
| min_src2_1stage[4*i+1][8] = 1'b0; |
| min_src2_1stage[4*i+2][8] = 1'b0; |
| min_src2_1stage[4*i+3][8] = rdt_ctrl.sign_opr ? min_src2_1stage[4*i+3][7] : 1'b0; |
| end |
| EEW16:begin |
| min_src2_1stage[4*i][8] = 1'b0; |
| min_src2_1stage[4*i+1][8] = rdt_ctrl.sign_opr ? min_src2_1stage[4*i+1][7] : 1'b0; |
| min_src2_1stage[4*i+2][8] = 1'b0; |
| min_src2_1stage[4*i+3][8] = rdt_ctrl.sign_opr ? min_src2_1stage[4*i+3][7] : 1'b0; |
| end |
| default:begin |
| min_src2_1stage[4*i][8] = rdt_ctrl.sign_opr ? min_src2_1stage[4*i][7] : 1'b0; |
| min_src2_1stage[4*i+1][8] = rdt_ctrl.sign_opr ? min_src2_1stage[4*i+1][7] : 1'b0; |
| min_src2_1stage[4*i+2][8] = rdt_ctrl.sign_opr ? min_src2_1stage[4*i+2][7] : 1'b0; |
| min_src2_1stage[4*i+3][8] = rdt_ctrl.sign_opr ? min_src2_1stage[4*i+3][7] : 1'b0; |
| end |
| endcase |
| end |
| |
| // min_src1_1stage data |
| always_comb begin |
| case (pmtrdt_uop.vs1_eew) |
| EEW32:begin |
| min_src1_1stage[4*i][7:0] = pmtrdt_uop.vs2_type[8*i+4] == BODY_ACTIVE ? ~pmtrdt_uop.vs2_data[8*(8*i+4)+:8] : ~8'hFF; |
| min_src1_1stage[4*i+1][7:0] = pmtrdt_uop.vs2_type[8*i+5] == BODY_ACTIVE ? ~pmtrdt_uop.vs2_data[8*(8*i+5)+:8] : ~8'hFF; |
| min_src1_1stage[4*i+2][7:0] = pmtrdt_uop.vs2_type[8*i+6] == BODY_ACTIVE ? ~pmtrdt_uop.vs2_data[8*(8*i+6)+:8] : ~8'hFF; |
| min_src1_1stage[4*i+3][7:0] = pmtrdt_uop.vs2_type[8*i+7] == BODY_ACTIVE ? ~pmtrdt_uop.vs2_data[8*(8*i+7)+:8] : (rdt_ctrl.sign_opr ? ~8'h7F : ~8'hFF); |
| end |
| EEW16:begin |
| min_src1_1stage[4*i][7:0] = pmtrdt_uop.vs2_type[8*i+4] == BODY_ACTIVE ? ~pmtrdt_uop.vs2_data[8*(8*i+4)+:8] : ~8'hFF; |
| min_src1_1stage[4*i+1][7:0] = pmtrdt_uop.vs2_type[8*i+5] == BODY_ACTIVE ? ~pmtrdt_uop.vs2_data[8*(8*i+5)+:8] : (rdt_ctrl.sign_opr ? ~8'h7F : ~8'hFF); |
| min_src1_1stage[4*i+2][7:0] = pmtrdt_uop.vs2_type[8*i+6] == BODY_ACTIVE ? ~pmtrdt_uop.vs2_data[8*(8*i+6)+:8] : ~8'hFF; |
| min_src1_1stage[4*i+3][7:0] = pmtrdt_uop.vs2_type[8*i+7] == BODY_ACTIVE ? ~pmtrdt_uop.vs2_data[8*(8*i+7)+:8] : (rdt_ctrl.sign_opr ? ~8'h7F : ~8'hFF); |
| end |
| default:begin |
| min_src1_1stage[4*i][7:0] = pmtrdt_uop.vs2_type[8*i+4] == BODY_ACTIVE ? ~pmtrdt_uop.vs2_data[8*(8*i+4)+:8] : (rdt_ctrl.sign_opr ? ~8'h7F : ~8'hFF); |
| min_src1_1stage[4*i+1][7:0] = pmtrdt_uop.vs2_type[8*i+5] == BODY_ACTIVE ? ~pmtrdt_uop.vs2_data[8*(8*i+5)+:8] : (rdt_ctrl.sign_opr ? ~8'h7F : ~8'hFF); |
| min_src1_1stage[4*i+2][7:0] = pmtrdt_uop.vs2_type[8*i+6] == BODY_ACTIVE ? ~pmtrdt_uop.vs2_data[8*(8*i+6)+:8] : (rdt_ctrl.sign_opr ? ~8'h7F : ~8'hFF); |
| min_src1_1stage[4*i+3][7:0] = pmtrdt_uop.vs2_type[8*i+7] == BODY_ACTIVE ? ~pmtrdt_uop.vs2_data[8*(8*i+7)+:8] : (rdt_ctrl.sign_opr ? ~8'h7F : ~8'hFF); |
| end |
| endcase |
| case (pmtrdt_uop.vs1_eew) |
| EEW32:begin |
| min_src1_1stage[4*i][8] = 1'b0; |
| min_src1_1stage[4*i+1][8] = 1'b0; |
| min_src1_1stage[4*i+2][8] = 1'b0; |
| min_src1_1stage[4*i+3][8] = rdt_ctrl.sign_opr ? min_src1_1stage[4*i+3][7] : ~1'b0; |
| end |
| EEW16:begin |
| min_src1_1stage[4*i][8] = 1'b0; |
| min_src1_1stage[4*i+1][8] = rdt_ctrl.sign_opr ? min_src1_1stage[4*i+1][7] : ~1'b0; |
| min_src1_1stage[4*i+2][8] = 1'b0; |
| min_src1_1stage[4*i+3][8] = rdt_ctrl.sign_opr ? min_src1_1stage[4*i+3][7] : ~1'b0; |
| end |
| default:begin |
| min_src1_1stage[4*i][8] = rdt_ctrl.sign_opr ? min_src1_1stage[4*i][7] : ~1'b0; |
| min_src1_1stage[4*i+1][8] = rdt_ctrl.sign_opr ? min_src1_1stage[4*i+1][7] : ~1'b0; |
| min_src1_1stage[4*i+2][8] = rdt_ctrl.sign_opr ? min_src1_1stage[4*i+2][7] : ~1'b0; |
| min_src1_1stage[4*i+3][8] = rdt_ctrl.sign_opr ? min_src1_1stage[4*i+3][7] : ~1'b0; |
| end |
| endcase |
| end |
| |
| // min_cin_1stage: carry-in of the four stage-1 MIN adder lanes 4*i .. 4*i+3 |
| always_comb begin |
|   // Bit k set: lane 4*i+k continues the carry chain of lane 4*i+k-1. |
|   logic [3:1] min_link_1s; |
|   case (pmtrdt_uop.vs1_eew) |
|     EEW32:   min_link_1s = 3'b111; // lanes 1..3 are chained to the lane below |
|     EEW16:   min_link_1s = 3'b101; // lanes 1 and 3 are chained, lane 2 starts a new chain |
|     default: min_link_1s = 3'b000; // every lane starts its own chain |
|   endcase |
|   // The first lane of a chain is fed a constant 1; a chained lane is fed |
|   // bit 8 of the previous lane's result. |
|   min_cin_1stage[4*i]   = 1'b1; |
|   min_cin_1stage[4*i+1] = min_link_1s[1] ? min_res_1stage[4*i][8]   : 1'b1; |
|   min_cin_1stage[4*i+2] = min_link_1s[2] ? min_res_1stage[4*i+1][8] : 1'b1; |
|   min_cin_1stage[4*i+3] = min_link_1s[3] ? min_res_1stage[4*i+2][8] : 1'b1; |
| end |
| end // end for (i=0; i<`VLENB/(2*4); i++) begin : gen_rdt_min_src_1stage_data |
| |
| // Stage 1 arithmetic: `VLENB/2 lanes, each with three adders (sum/max/min), |
| // and/or/xor of the logic operands, and the two comparison flags. |
| for (genvar lane=0; lane<`VLENB/2; lane++) begin : gen_rdt_arithmetic_unit_1stage |
|   // bitwise operations on the logic operand pair |
|   assign and_1stage[lane] = logic_src2_1stage[lane] & logic_src1_1stage[lane]; |
|   assign or_1stage[lane]  = logic_src2_1stage[lane] | logic_src1_1stage[lane]; |
|   assign xor_1stage[lane] = logic_src2_1stage[lane] ^ logic_src1_1stage[lane]; |
|   // src2 + src1 + carry-in for each of the three adder flavours |
|   assign sum_res_1stage[lane] = sum_src2_1stage[lane] + sum_src1_1stage[lane] + sum_cin_1stage[lane]; |
|   assign max_res_1stage[lane] = max_src2_1stage[lane] + max_src1_1stage[lane] + max_cin_1stage[lane]; |
|   assign min_res_1stage[lane] = min_src2_1stage[lane] + min_src1_1stage[lane] + min_cin_1stage[lane]; |
|   // comparison flags are read from bit 8 of the max/min adder results |
|   assign great_than_1stage[lane] = ~max_res_1stage[lane][8]; |
|   assign less_than_1stage[lane]  = min_res_1stage[lane][8]; |
| end |
| |
| // Stage 2 SUM operands: each group of four lanes adds sum_res_1stage[4*i+k] |
| // to sum_res_1stage[`VLENB/4+4*i+k] (k = 0..3). |
| for (i=0; i<`VLENB/(4*4); i++) begin : gen_rdt_sum_src_2stage_data |
|   // sum_src2_2stage: operand taken from the lower half of the stage-1 sums |
|   always_comb begin |
|     // Bit k set: lane 4*i+k holds the most-significant byte of an element. |
|     logic [3:0] sum2_top; |
|     case (pmtrdt_uop.vs1_eew) // Reduction instruction: widen_vs2_eew == vs1_eew |
|       EEW32:   sum2_top = 4'b1000; |
|       EEW16:   sum2_top = 4'b1010; |
|       default: sum2_top = 4'b1111; |
|     endcase |
|     sum_src2_2stage[4*i][7:0]   = sum_res_1stage[4*i][7:0]; |
|     sum_src2_2stage[4*i+1][7:0] = sum_res_1stage[4*i+1][7:0]; |
|     sum_src2_2stage[4*i+2][7:0] = sum_res_1stage[4*i+2][7:0]; |
|     sum_src2_2stage[4*i+3][7:0] = sum_res_1stage[4*i+3][7:0]; |
|     // bit 8: copy of bit 7 on a top lane of a signed operation, otherwise 0 |
|     sum_src2_2stage[4*i][8]   = (sum2_top[0] && rdt_ctrl.sign_opr) ? sum_src2_2stage[4*i][7]   : 1'b0; |
|     sum_src2_2stage[4*i+1][8] = (sum2_top[1] && rdt_ctrl.sign_opr) ? sum_src2_2stage[4*i+1][7] : 1'b0; |
|     sum_src2_2stage[4*i+2][8] = (sum2_top[2] && rdt_ctrl.sign_opr) ? sum_src2_2stage[4*i+2][7] : 1'b0; |
|     sum_src2_2stage[4*i+3][8] = (sum2_top[3] && rdt_ctrl.sign_opr) ? sum_src2_2stage[4*i+3][7] : 1'b0; |
|   end |
| |
|   // sum_src1_2stage: operand taken from the upper half of the stage-1 sums |
|   always_comb begin |
|     logic [3:0] sum1_top; // same lane classification as above |
|     case (pmtrdt_uop.vs1_eew) |
|       EEW32:   sum1_top = 4'b1000; |
|       EEW16:   sum1_top = 4'b1010; |
|       default: sum1_top = 4'b1111; |
|     endcase |
|     sum_src1_2stage[4*i][7:0]   = sum_res_1stage[`VLENB/4+4*i][7:0]; |
|     sum_src1_2stage[4*i+1][7:0] = sum_res_1stage[`VLENB/4+4*i+1][7:0]; |
|     sum_src1_2stage[4*i+2][7:0] = sum_res_1stage[`VLENB/4+4*i+2][7:0]; |
|     sum_src1_2stage[4*i+3][7:0] = sum_res_1stage[`VLENB/4+4*i+3][7:0]; |
|     sum_src1_2stage[4*i][8]   = (sum1_top[0] && rdt_ctrl.sign_opr) ? sum_src1_2stage[4*i][7]   : 1'b0; |
|     sum_src1_2stage[4*i+1][8] = (sum1_top[1] && rdt_ctrl.sign_opr) ? sum_src1_2stage[4*i+1][7] : 1'b0; |
|     sum_src1_2stage[4*i+2][8] = (sum1_top[2] && rdt_ctrl.sign_opr) ? sum_src1_2stage[4*i+2][7] : 1'b0; |
|     sum_src1_2stage[4*i+3][8] = (sum1_top[3] && rdt_ctrl.sign_opr) ? sum_src1_2stage[4*i+3][7] : 1'b0; |
|   end |
| |
|   // sum_cin_2stage: 0 at the start of a carry chain, otherwise bit 8 of the lane below |
|   always_comb begin |
|     // Bit k set: lane 4*i+k continues the carry chain of lane 4*i+k-1. |
|     logic [3:1] sum_link_2s; |
|     case (pmtrdt_uop.vs1_eew) |
|       EEW32:   sum_link_2s = 3'b111; |
|       EEW16:   sum_link_2s = 3'b101; |
|       default: sum_link_2s = 3'b000; |
|     endcase |
|     sum_cin_2stage[4*i]   = 1'b0; |
|     sum_cin_2stage[4*i+1] = sum_link_2s[1] ? sum_res_2stage[4*i][8]   : 1'b0; |
|     sum_cin_2stage[4*i+2] = sum_link_2s[2] ? sum_res_2stage[4*i+1][8] : 1'b0; |
|     sum_cin_2stage[4*i+3] = sum_link_2s[3] ? sum_res_2stage[4*i+2][8] : 1'b0; |
|   end |
| end //end for (i=0; i<`VLENB/(4*4); i++) begin : gen_rdt_sum_src_2stage_data |
| |
| // Stage 2 MAX operands: the stage-1 great_than flags choose, per element, which |
| // stage-1 operand is forwarded from lanes 4*i+k (-> max_src2_2stage) and from |
| // lanes `VLENB/4+4*i+k (-> max_src1_2stage, kept in inverted form). |
| for (i=0; i<`VLENB/(4*4); i++) begin : gen_rdt_max_src_2stage_data |
|   // max_src2_2stage data |
|   always_comb begin |
|     logic [3:0] max2_top;  // bit k set: lane 4*i+k is the top byte of an element |
|     logic [3:0] max2_keep; // bit k set: lane 4*i+k forwards the stage-1 src2 byte |
|     // The flag of an element is the one produced by its top lane. |
|     case (pmtrdt_uop.vs1_eew) // Reduction instruction: widen_vs2_eew == vs1_eew |
|       EEW32:begin |
|         max2_top  = 4'b1000; |
|         max2_keep = {4{great_than_1stage[4*i+3]}}; |
|       end |
|       EEW16:begin |
|         max2_top  = 4'b1010; |
|         max2_keep = {{2{great_than_1stage[4*i+3]}}, {2{great_than_1stage[4*i+1]}}}; |
|       end |
|       default:begin |
|         max2_top  = 4'b1111; |
|         max2_keep = {great_than_1stage[4*i+3], great_than_1stage[4*i+2], |
|                      great_than_1stage[4*i+1], great_than_1stage[4*i+0]}; |
|       end |
|     endcase |
|     // flag set -> src2 byte as is, flag clear -> src1 byte inverted back |
|     max_src2_2stage[4*i][7:0]   = max2_keep[0] ? max_src2_1stage[4*i][7:0]   : ~max_src1_1stage[4*i][7:0]; |
|     max_src2_2stage[4*i+1][7:0] = max2_keep[1] ? max_src2_1stage[4*i+1][7:0] : ~max_src1_1stage[4*i+1][7:0]; |
|     max_src2_2stage[4*i+2][7:0] = max2_keep[2] ? max_src2_1stage[4*i+2][7:0] : ~max_src1_1stage[4*i+2][7:0]; |
|     max_src2_2stage[4*i+3][7:0] = max2_keep[3] ? max_src2_1stage[4*i+3][7:0] : ~max_src1_1stage[4*i+3][7:0]; |
|     // bit 8: copy of bit 7 on a top lane of a signed operation, otherwise 0 |
|     max_src2_2stage[4*i][8]   = (max2_top[0] && rdt_ctrl.sign_opr) ? max_src2_2stage[4*i][7]   : 1'b0; |
|     max_src2_2stage[4*i+1][8] = (max2_top[1] && rdt_ctrl.sign_opr) ? max_src2_2stage[4*i+1][7] : 1'b0; |
|     max_src2_2stage[4*i+2][8] = (max2_top[2] && rdt_ctrl.sign_opr) ? max_src2_2stage[4*i+2][7] : 1'b0; |
|     max_src2_2stage[4*i+3][8] = (max2_top[3] && rdt_ctrl.sign_opr) ? max_src2_2stage[4*i+3][7] : 1'b0; |
|   end |
| |
|   // max_src1_2stage data |
|   always_comb begin |
|     logic [3:0] max1_top;  // bit k set: lane is the top byte of an element |
|     logic [3:0] max1_inv2; // bit k set: lane forwards the inverted stage-1 src2 byte |
|     case (pmtrdt_uop.vs1_eew) |
|       EEW32:begin |
|         max1_top  = 4'b1000; |
|         max1_inv2 = {4{great_than_1stage[`VLENB/4+4*i+3]}}; |
|       end |
|       EEW16:begin |
|         max1_top  = 4'b1010; |
|         max1_inv2 = {{2{great_than_1stage[`VLENB/4+4*i+3]}}, {2{great_than_1stage[`VLENB/4+4*i+1]}}}; |
|       end |
|       default:begin |
|         max1_top  = 4'b1111; |
|         max1_inv2 = {great_than_1stage[`VLENB/4+4*i+3], great_than_1stage[`VLENB/4+4*i+2], |
|                      great_than_1stage[`VLENB/4+4*i+1], great_than_1stage[`VLENB/4+4*i+0]}; |
|       end |
|     endcase |
|     // flag set -> src2 byte inverted, flag clear -> src1 byte as stored |
|     max_src1_2stage[4*i][7:0]   = max1_inv2[0] ? ~max_src2_1stage[`VLENB/4+4*i][7:0]   : max_src1_1stage[`VLENB/4+4*i][7:0]; |
|     max_src1_2stage[4*i+1][7:0] = max1_inv2[1] ? ~max_src2_1stage[`VLENB/4+4*i+1][7:0] : max_src1_1stage[`VLENB/4+4*i+1][7:0]; |
|     max_src1_2stage[4*i+2][7:0] = max1_inv2[2] ? ~max_src2_1stage[`VLENB/4+4*i+2][7:0] : max_src1_1stage[`VLENB/4+4*i+2][7:0]; |
|     max_src1_2stage[4*i+3][7:0] = max1_inv2[3] ? ~max_src2_1stage[`VLENB/4+4*i+3][7:0] : max_src1_1stage[`VLENB/4+4*i+3][7:0]; |
|     // bit 8 on a top lane: bit 7 when signed, 1 when unsigned; 0 on other lanes |
|     max_src1_2stage[4*i][8]   = max1_top[0] ? (rdt_ctrl.sign_opr ? max_src1_2stage[4*i][7]   : 1'b1) : 1'b0; |
|     max_src1_2stage[4*i+1][8] = max1_top[1] ? (rdt_ctrl.sign_opr ? max_src1_2stage[4*i+1][7] : 1'b1) : 1'b0; |
|     max_src1_2stage[4*i+2][8] = max1_top[2] ? (rdt_ctrl.sign_opr ? max_src1_2stage[4*i+2][7] : 1'b1) : 1'b0; |
|     max_src1_2stage[4*i+3][8] = max1_top[3] ? (rdt_ctrl.sign_opr ? max_src1_2stage[4*i+3][7] : 1'b1) : 1'b0; |
|   end |
| |
|   // max_cin_2stage: 1 at the start of a carry chain, otherwise bit 8 of the lane below |
|   always_comb begin |
|     // Bit k set: lane 4*i+k continues the carry chain of lane 4*i+k-1. |
|     logic [3:1] max_link_2s; |
|     case (pmtrdt_uop.vs1_eew) |
|       EEW32:   max_link_2s = 3'b111; |
|       EEW16:   max_link_2s = 3'b101; |
|       default: max_link_2s = 3'b000; |
|     endcase |
|     max_cin_2stage[4*i]   = 1'b1; |
|     max_cin_2stage[4*i+1] = max_link_2s[1] ? max_res_2stage[4*i][8]   : 1'b1; |
|     max_cin_2stage[4*i+2] = max_link_2s[2] ? max_res_2stage[4*i+1][8] : 1'b1; |
|     max_cin_2stage[4*i+3] = max_link_2s[3] ? max_res_2stage[4*i+2][8] : 1'b1; |
|   end |
| end // end for (i=0; i<`VLENB/(4*4); i++) begin : gen_rdt_max_src_2stage_data |
| |
| // Stage 2 MIN operands: the stage-1 less_than flags choose, per element, which |
| // stage-1 operand is forwarded from lanes 4*i+k (-> min_src2_2stage) and from |
| // lanes `VLENB/4+4*i+k (-> min_src1_2stage, kept in inverted form). |
| for (i=0; i<`VLENB/(4*4); i++) begin : gen_rdt_min_src_2stage_data |
|   // min_src2_2stage data |
|   always_comb begin |
|     logic [3:0] min2_top;  // bit k set: lane 4*i+k is the top byte of an element |
|     logic [3:0] min2_keep; // bit k set: lane 4*i+k forwards the stage-1 src2 byte |
|     // The flag of an element is the one produced by its top lane. |
|     case (pmtrdt_uop.vs1_eew) // Reduction instruction: widen_vs2_eew == vs1_eew |
|       EEW32:begin |
|         min2_top  = 4'b1000; |
|         min2_keep = {4{less_than_1stage[4*i+3]}}; |
|       end |
|       EEW16:begin |
|         min2_top  = 4'b1010; |
|         min2_keep = {{2{less_than_1stage[4*i+3]}}, {2{less_than_1stage[4*i+1]}}}; |
|       end |
|       default:begin |
|         min2_top  = 4'b1111; |
|         min2_keep = {less_than_1stage[4*i+3], less_than_1stage[4*i+2], |
|                      less_than_1stage[4*i+1], less_than_1stage[4*i+0]}; |
|       end |
|     endcase |
|     // flag set -> src2 byte as is, flag clear -> src1 byte inverted back |
|     min_src2_2stage[4*i][7:0]   = min2_keep[0] ? min_src2_1stage[4*i][7:0]   : ~min_src1_1stage[4*i][7:0]; |
|     min_src2_2stage[4*i+1][7:0] = min2_keep[1] ? min_src2_1stage[4*i+1][7:0] : ~min_src1_1stage[4*i+1][7:0]; |
|     min_src2_2stage[4*i+2][7:0] = min2_keep[2] ? min_src2_1stage[4*i+2][7:0] : ~min_src1_1stage[4*i+2][7:0]; |
|     min_src2_2stage[4*i+3][7:0] = min2_keep[3] ? min_src2_1stage[4*i+3][7:0] : ~min_src1_1stage[4*i+3][7:0]; |
|     // bit 8: copy of bit 7 on a top lane of a signed operation, otherwise 0 |
|     min_src2_2stage[4*i][8]   = (min2_top[0] && rdt_ctrl.sign_opr) ? min_src2_2stage[4*i][7]   : 1'b0; |
|     min_src2_2stage[4*i+1][8] = (min2_top[1] && rdt_ctrl.sign_opr) ? min_src2_2stage[4*i+1][7] : 1'b0; |
|     min_src2_2stage[4*i+2][8] = (min2_top[2] && rdt_ctrl.sign_opr) ? min_src2_2stage[4*i+2][7] : 1'b0; |
|     min_src2_2stage[4*i+3][8] = (min2_top[3] && rdt_ctrl.sign_opr) ? min_src2_2stage[4*i+3][7] : 1'b0; |
|   end |
| |
|   // min_src1_2stage data |
|   always_comb begin |
|     logic [3:0] min1_top;  // bit k set: lane is the top byte of an element |
|     logic [3:0] min1_inv2; // bit k set: lane forwards the inverted stage-1 src2 byte |
|     case (pmtrdt_uop.vs1_eew) |
|       EEW32:begin |
|         min1_top  = 4'b1000; |
|         min1_inv2 = {4{less_than_1stage[`VLENB/4+4*i+3]}}; |
|       end |
|       EEW16:begin |
|         min1_top  = 4'b1010; |
|         min1_inv2 = {{2{less_than_1stage[`VLENB/4+4*i+3]}}, {2{less_than_1stage[`VLENB/4+4*i+1]}}}; |
|       end |
|       default:begin |
|         min1_top  = 4'b1111; |
|         min1_inv2 = {less_than_1stage[`VLENB/4+4*i+3], less_than_1stage[`VLENB/4+4*i+2], |
|                      less_than_1stage[`VLENB/4+4*i+1], less_than_1stage[`VLENB/4+4*i+0]}; |
|       end |
|     endcase |
|     // flag set -> src2 byte inverted, flag clear -> src1 byte as stored |
|     min_src1_2stage[4*i][7:0]   = min1_inv2[0] ? ~min_src2_1stage[`VLENB/4+4*i][7:0]   : min_src1_1stage[`VLENB/4+4*i][7:0]; |
|     min_src1_2stage[4*i+1][7:0] = min1_inv2[1] ? ~min_src2_1stage[`VLENB/4+4*i+1][7:0] : min_src1_1stage[`VLENB/4+4*i+1][7:0]; |
|     min_src1_2stage[4*i+2][7:0] = min1_inv2[2] ? ~min_src2_1stage[`VLENB/4+4*i+2][7:0] : min_src1_1stage[`VLENB/4+4*i+2][7:0]; |
|     min_src1_2stage[4*i+3][7:0] = min1_inv2[3] ? ~min_src2_1stage[`VLENB/4+4*i+3][7:0] : min_src1_1stage[`VLENB/4+4*i+3][7:0]; |
|     // bit 8 on a top lane: bit 7 when signed, 1 when unsigned; 0 on other lanes |
|     min_src1_2stage[4*i][8]   = min1_top[0] ? (rdt_ctrl.sign_opr ? min_src1_2stage[4*i][7]   : 1'b1) : 1'b0; |
|     min_src1_2stage[4*i+1][8] = min1_top[1] ? (rdt_ctrl.sign_opr ? min_src1_2stage[4*i+1][7] : 1'b1) : 1'b0; |
|     min_src1_2stage[4*i+2][8] = min1_top[2] ? (rdt_ctrl.sign_opr ? min_src1_2stage[4*i+2][7] : 1'b1) : 1'b0; |
|     min_src1_2stage[4*i+3][8] = min1_top[3] ? (rdt_ctrl.sign_opr ? min_src1_2stage[4*i+3][7] : 1'b1) : 1'b0; |
|   end |
| |
|   // min_cin_2stage: 1 at the start of a carry chain, otherwise bit 8 of the lane below |
|   always_comb begin |
|     // Bit k set: lane 4*i+k continues the carry chain of lane 4*i+k-1. |
|     logic [3:1] min_link_2s; |
|     case (pmtrdt_uop.vs1_eew) |
|       EEW32:   min_link_2s = 3'b111; |
|       EEW16:   min_link_2s = 3'b101; |
|       default: min_link_2s = 3'b000; |
|     endcase |
|     min_cin_2stage[4*i]   = 1'b1; |
|     min_cin_2stage[4*i+1] = min_link_2s[1] ? min_res_2stage[4*i][8]   : 1'b1; |
|     min_cin_2stage[4*i+2] = min_link_2s[2] ? min_res_2stage[4*i+1][8] : 1'b1; |
|     min_cin_2stage[4*i+3] = min_link_2s[3] ? min_res_2stage[4*i+2][8] : 1'b1; |
|   end |
| end // end for (i=0; i<`VLENB/(4*4); i++) begin : gen_rdt_min_src_2stage_data |
| |
| // Stage 2 arithmetic: `VLENB/4 lanes, each with three adders (sum/max/min) |
| // and the two comparison flags. |
| for (genvar lane=0; lane<`VLENB/4; lane++) begin : gen_rdt_arithmetic_unit_2stage |
|   // src2 + src1 + carry-in for each of the three adder flavours |
|   assign min_res_2stage[lane] = min_src2_2stage[lane] + min_src1_2stage[lane] + min_cin_2stage[lane]; |
|   assign max_res_2stage[lane] = max_src2_2stage[lane] + max_src1_2stage[lane] + max_cin_2stage[lane]; |
|   assign sum_res_2stage[lane] = sum_src2_2stage[lane] + sum_src1_2stage[lane] + sum_cin_2stage[lane]; |
|   // comparison flags are read from bit 8 of the max/min adder results |
|   assign great_than_2stage[lane] = ~max_res_2stage[lane][8]; |
|   assign less_than_2stage[lane]  = min_res_2stage[lane][8]; |
| end |
| // Stage 2 and/or/xor: within every group of eight stage-1 lanes, byte k of the |
| // low four lanes is combined with byte k of the high four lanes (k = 0..3). |
| for (genvar grp=0; grp<`VLENB/(4*4); grp++) begin: gen_rdt_logic_unit_2stage |
|   for (genvar k=0; k<4; k++) begin : gen_byte |
|     assign and_2stage[4*grp+k] = and_1stage[8*grp+k] & and_1stage[8*grp+4+k]; |
|     assign or_2stage[4*grp+k]  = or_1stage[8*grp+k]  | or_1stage[8*grp+4+k]; |
|     assign xor_2stage[4*grp+k] = xor_1stage[8*grp+k] ^ xor_1stage[8*grp+4+k]; |
|   end |
| end |
| |
| // Combination of the registered reduction result (*_res_ex1) with the |
| // registered vs1 value (*_vs1_ex1): src1_vd_1stage/src2_vs1_1stage/carry_in_vd_1stage. |
| // src2_vs1_1stage: the vs1-side adder operand, four 9-bit lanes. |
| always_comb begin |
|   // Bit k set: lane k holds the most-significant byte of an element. |
|   logic [3:0] vs1_top_1s; |
|   case (rdt_ctrl_q.vs1_eew) // Reduction instruction: widen_vs2_eew == vs1_eew |
|     EEW32:   vs1_top_1s = 4'b1000; |
|     EEW16:   vs1_top_1s = 4'b1010; |
|     default: vs1_top_1s = 4'b1111; |
|   endcase |
|   // data bytes: taken from the ex1 register that matches the operation |
|   case (rdt_ctrl_q.rdt_opr) |
|     MAX: |
|       {src2_vs1_1stage[3][7:0], src2_vs1_1stage[2][7:0], src2_vs1_1stage[1][7:0], src2_vs1_1stage[0][7:0]} = |
|         {max_vs1_ex1[3][7:0], max_vs1_ex1[2][7:0], max_vs1_ex1[1][7:0], max_vs1_ex1[0][7:0]}; |
|     MIN: |
|       {src2_vs1_1stage[3][7:0], src2_vs1_1stage[2][7:0], src2_vs1_1stage[1][7:0], src2_vs1_1stage[0][7:0]} = |
|         {min_vs1_ex1[3][7:0], min_vs1_ex1[2][7:0], min_vs1_ex1[1][7:0], min_vs1_ex1[0][7:0]}; |
|     default: |
|       {src2_vs1_1stage[3][7:0], src2_vs1_1stage[2][7:0], src2_vs1_1stage[1][7:0], src2_vs1_1stage[0][7:0]} = |
|         {sum_vs1_ex1[3][7:0], sum_vs1_ex1[2][7:0], sum_vs1_ex1[1][7:0], sum_vs1_ex1[0][7:0]}; |
|   endcase |
|   // bit 8: copy of bit 7 on a top lane of a signed operation, otherwise 0 |
|   src2_vs1_1stage[0][8] = (vs1_top_1s[0] && rdt_ctrl_q.sign_opr) ? src2_vs1_1stage[0][7] : 1'b0; |
|   src2_vs1_1stage[1][8] = (vs1_top_1s[1] && rdt_ctrl_q.sign_opr) ? src2_vs1_1stage[1][7] : 1'b0; |
|   src2_vs1_1stage[2][8] = (vs1_top_1s[2] && rdt_ctrl_q.sign_opr) ? src2_vs1_1stage[2][7] : 1'b0; |
|   src2_vs1_1stage[3][8] = (vs1_top_1s[3] && rdt_ctrl_q.sign_opr) ? src2_vs1_1stage[3][7] : 1'b0; |
| end |
| |
| // src1_vd_1stage: the result-side adder operand, four 9-bit lanes. |
| // MAX/MIN feed the inverted registered result; every other operation feeds |
| // sum_res_ex1 unchanged. |
| always_comb begin |
|   logic [3:0] vd_top_1s;  // bit k set: lane k is the top byte of an element |
|   logic       vd_fill_1s; // bit 8 of a top lane when the operation is unsigned |
|   case (rdt_ctrl_q.vs1_eew) // Reduction instruction: widen_vs2_eew == vs1_eew |
|     EEW32:   vd_top_1s = 4'b1000; |
|     EEW16:   vd_top_1s = 4'b1010; |
|     default: vd_top_1s = 4'b1111; |
|   endcase |
|   case (rdt_ctrl_q.rdt_opr) |
|     MAX:begin |
|       {src1_vd_1stage[3][7:0], src1_vd_1stage[2][7:0], src1_vd_1stage[1][7:0], src1_vd_1stage[0][7:0]} = |
|         {~max_res_ex1[3][7:0], ~max_res_ex1[2][7:0], ~max_res_ex1[1][7:0], ~max_res_ex1[0][7:0]}; |
|       vd_fill_1s = 1'b1; |
|     end |
|     MIN:begin |
|       {src1_vd_1stage[3][7:0], src1_vd_1stage[2][7:0], src1_vd_1stage[1][7:0], src1_vd_1stage[0][7:0]} = |
|         {~min_res_ex1[3][7:0], ~min_res_ex1[2][7:0], ~min_res_ex1[1][7:0], ~min_res_ex1[0][7:0]}; |
|       vd_fill_1s = 1'b1; |
|     end |
|     default:begin |
|       {src1_vd_1stage[3][7:0], src1_vd_1stage[2][7:0], src1_vd_1stage[1][7:0], src1_vd_1stage[0][7:0]} = |
|         {sum_res_ex1[3][7:0], sum_res_ex1[2][7:0], sum_res_ex1[1][7:0], sum_res_ex1[0][7:0]}; |
|       vd_fill_1s = 1'b0; |
|     end |
|   endcase |
|   // bit 8 on a top lane: bit 7 when signed, vd_fill_1s when unsigned; 0 on other lanes |
|   src1_vd_1stage[0][8] = vd_top_1s[0] ? (rdt_ctrl_q.sign_opr ? src1_vd_1stage[0][7] : vd_fill_1s) : 1'b0; |
|   src1_vd_1stage[1][8] = vd_top_1s[1] ? (rdt_ctrl_q.sign_opr ? src1_vd_1stage[1][7] : vd_fill_1s) : 1'b0; |
|   src1_vd_1stage[2][8] = vd_top_1s[2] ? (rdt_ctrl_q.sign_opr ? src1_vd_1stage[2][7] : vd_fill_1s) : 1'b0; |
|   src1_vd_1stage[3][8] = vd_top_1s[3] ? (rdt_ctrl_q.sign_opr ? src1_vd_1stage[3][7] : vd_fill_1s) : 1'b0; |
| end |
| |
| // carry_in_vd_1stage: carry-in of the four vs1/result adder lanes. |
| // A carry chain starts with 1 for MAX/MIN and with 0 for every other operation; |
| // a chained lane takes bit 8 of the lane below. |
| always_comb begin |
|   logic       vd_cin_first; // carry-in at the start of a chain |
|   logic [3:1] vd_link_1s;   // bit k set: lane k continues the chain of lane k-1 |
|   case (rdt_ctrl_q.rdt_opr) |
|     MAX, |
|     MIN:     vd_cin_first = 1'b1; |
|     default: vd_cin_first = 1'b0; |
|   endcase |
|   case (rdt_ctrl_q.vs1_eew) // Reduction instruction: widen_vs2_eew == vs1_eew |
|     EEW32:   vd_link_1s = 3'b111; |
|     EEW16:   vd_link_1s = 3'b101; |
|     default: vd_link_1s = 3'b000; |
|   endcase |
|   carry_in_vd_1stage[0] = vd_cin_first; |
|   carry_in_vd_1stage[1] = vd_link_1s[1] ? sum_vd_1stage[0][8] : vd_cin_first; |
|   carry_in_vd_1stage[2] = vd_link_1s[2] ? sum_vd_1stage[1][8] : vd_cin_first; |
|   carry_in_vd_1stage[3] = vd_link_1s[3] ? sum_vd_1stage[2][8] : vd_cin_first; |
| end |
| |
| // Four lanes combining the registered vs1 value with the registered result: |
| // one 9-bit adder plus and/or/xor per lane, and the two comparison flags. |
| for (genvar lane=0; lane<4; lane++) begin : gen_rdt_arithmetic_unit_vs1vd_1stage |
|   // bitwise operations on the ex1 registers |
|   assign and_vd_1stage[lane] = and_vs1_ex1[lane] & and_res_ex1[lane]; |
|   assign or_vd_1stage[lane]  = or_vs1_ex1[lane]  | or_res_ex1[lane]; |
|   assign xor_vd_1stage[lane] = xor_vs1_ex1[lane] ^ xor_res_ex1[lane]; |
|   // shared adder; bit 8 of its result is the less-than flag, its inverse the great-than flag |
|   assign sum_vd_1stage[lane] = src2_vs1_1stage[lane] + src1_vd_1stage[lane] + carry_in_vd_1stage[lane]; |
|   assign great_than_vd_1stage[lane] = ~sum_vd_1stage[lane][8]; |
|   assign less_than_vd_1stage[lane]  = sum_vd_1stage[lane][8]; |
| end |
| |
| // src2_vs1_2stage: four 9-bit lanes holding either vs1 element 0 or the |
| // operation's fill constant. |
| // sel_vs1 is set on the last uop of a non-widening operation, or on the last |
| // uop of a widening operation while red_widen_sum_flag is set. |
| assign sel_vs1 = (rdt_ctrl.last_uop_valid && !rdt_ctrl.widen) || |
|                  (rdt_ctrl.last_uop_valid && rdt_ctrl.widen && red_widen_sum_flag); |
| always_comb begin |
|   logic [3:0] vs1_top_2s;  // bit k set: lane k is the top byte of an element |
|   logic [3:0] vs1_load_2s; // bit k set: lane k is loaded from pmtrdt_uop.vs1_data |
|   logic [7:0] fill_top;    // fill byte for a top lane |
|   logic [7:0] fill_low;    // fill byte for any other lane |
| |
|   case (pmtrdt_uop.vs1_eew) |
|     EEW32:   vs1_top_2s = 4'b1000; |
|     EEW16:   vs1_top_2s = 4'b1010; |
|     default: vs1_top_2s = 4'b1111; |
|   endcase |
| |
|   // With sel_vs1 only the bytes of element 0 are loaded; all remaining lanes, |
|   // and every lane without sel_vs1, receive the fill constant. |
|   if (sel_vs1) begin |
|     case (pmtrdt_uop.vs1_eew) |
|       EEW32:   vs1_load_2s = 4'b1111; |
|       EEW16:   vs1_load_2s = 4'b0011; |
|       default: vs1_load_2s = 4'b0001; |
|     endcase |
|   end else begin |
|     vs1_load_2s = 4'b0000; |
|   end |
| |
|   // fill constant per operation; only MAX and MIN depend on signedness, and |
|   // only in the top byte of an element |
|   case (rdt_ctrl.rdt_opr) |
|     MAX:begin |
|       fill_low = 8'h00; |
|       fill_top = rdt_ctrl.sign_opr ? 8'h80 : 8'h00; |
|     end |
|     MIN:begin |
|       fill_low = 8'hFF; |
|       fill_top = rdt_ctrl.sign_opr ? 8'h7F : 8'hFF; |
|     end |
|     AND:begin |
|       fill_low = 8'hFF; |
|       fill_top = 8'hFF; |
|     end |
|     default:begin |
|       fill_low = 8'h00; |
|       fill_top = 8'h00; |
|     end |
|   endcase |
| |
|   src2_vs1_2stage[0][7:0] = vs1_load_2s[0] ? pmtrdt_uop.vs1_data[8*0+:8] : (vs1_top_2s[0] ? fill_top : fill_low); |
|   src2_vs1_2stage[1][7:0] = vs1_load_2s[1] ? pmtrdt_uop.vs1_data[8*1+:8] : (vs1_top_2s[1] ? fill_top : fill_low); |
|   src2_vs1_2stage[2][7:0] = vs1_load_2s[2] ? pmtrdt_uop.vs1_data[8*2+:8] : (vs1_top_2s[2] ? fill_top : fill_low); |
|   src2_vs1_2stage[3][7:0] = vs1_load_2s[3] ? pmtrdt_uop.vs1_data[8*3+:8] : (vs1_top_2s[3] ? fill_top : fill_low); |
| |
|   // bit 8: copy of bit 7 on a top lane of a signed operation, otherwise 0 |
|   src2_vs1_2stage[0][8] = (vs1_top_2s[0] && rdt_ctrl.sign_opr) ? src2_vs1_2stage[0][7] : 1'b0; |
|   src2_vs1_2stage[1][8] = (vs1_top_2s[1] && rdt_ctrl.sign_opr) ? src2_vs1_2stage[1][7] : 1'b0; |
|   src2_vs1_2stage[2][8] = (vs1_top_2s[2] && rdt_ctrl.sign_opr) ? src2_vs1_2stage[2][7] : 1'b0; |
|   src2_vs1_2stage[3][8] = (vs1_top_2s[3] && rdt_ctrl.sign_opr) ? src2_vs1_2stage[3][7] : 1'b0; |
| end |
| |
| // src1_vd_2stage data |
| // Second operand of the 2nd-stage vs1/vd lane, produced one byte per loop |
| // iteration: bits [7:0] are the payload, bit [8] is the extension bit of the |
| // 9-bit adder input. Byte b is handled as the top byte of its element when |
| // b==3 (EEW32), when b is odd (EEW16), or unconditionally (any other EEW). |
| always_comb begin |
|   for (int b=0; b<4; b++) begin |
|     // ---- payload bits [7:0] ---- |
|     if (pmtrdt_uop.first_uop_valid && !red_widen_sum_flag) begin |
|       // constant start value selected by the operation |
|       case (rdt_ctrl.rdt_opr) |
|         MAX:begin |
|           case (pmtrdt_uop.vs1_eew) |
|             EEW32:   src1_vd_2stage[b][7:0] = ((b==3)   && rdt_ctrl.sign_opr) ? ~8'h80 : ~8'h00; |
|             EEW16:   src1_vd_2stage[b][7:0] = ((b%2==1) && rdt_ctrl.sign_opr) ? ~8'h80 : ~8'h00; |
|             default: src1_vd_2stage[b][7:0] = rdt_ctrl.sign_opr ? ~8'h80 : ~8'h00; |
|           endcase |
|         end |
|         MIN:begin |
|           case (pmtrdt_uop.vs1_eew) |
|             EEW32:   src1_vd_2stage[b][7:0] = ((b==3)   && rdt_ctrl.sign_opr) ? ~8'h7F : ~8'hFF; |
|             EEW16:   src1_vd_2stage[b][7:0] = ((b%2==1) && rdt_ctrl.sign_opr) ? ~8'h7F : ~8'hFF; |
|             default: src1_vd_2stage[b][7:0] = rdt_ctrl.sign_opr ? ~8'h7F : ~8'hFF; |
|           endcase |
|         end |
|         AND:     src1_vd_2stage[b][7:0] = 8'hFF; |
|         default: src1_vd_2stage[b][7:0] = 8'h00; |
|       endcase |
|     end else begin |
|       // otherwise take the value derived from the 1st-stage vs1/vd lane |
|       case (rdt_ctrl.rdt_opr) |
|         MAX:begin |
|           // the compare flag of the element's top byte selects all bytes of that element |
|           case (pmtrdt_uop.vs1_eew) |
|             EEW32:   src1_vd_2stage[b][7:0] = great_than_vd_1stage[3]         ? ~src2_vs1_1stage[b][7:0] : src1_vd_1stage[b][7:0]; |
|             EEW16:   src1_vd_2stage[b][7:0] = great_than_vd_1stage[2*(b/2)+1] ? ~src2_vs1_1stage[b][7:0] : src1_vd_1stage[b][7:0]; |
|             default: src1_vd_2stage[b][7:0] = great_than_vd_1stage[b]         ? ~src2_vs1_1stage[b][7:0] : src1_vd_1stage[b][7:0]; |
|           endcase |
|         end |
|         MIN:begin |
|           case (pmtrdt_uop.vs1_eew) |
|             EEW32:   src1_vd_2stage[b][7:0] = less_than_vd_1stage[3]         ? ~src2_vs1_1stage[b][7:0] : src1_vd_1stage[b][7:0]; |
|             EEW16:   src1_vd_2stage[b][7:0] = less_than_vd_1stage[2*(b/2)+1] ? ~src2_vs1_1stage[b][7:0] : src1_vd_1stage[b][7:0]; |
|             default: src1_vd_2stage[b][7:0] = less_than_vd_1stage[b]         ? ~src2_vs1_1stage[b][7:0] : src1_vd_1stage[b][7:0]; |
|           endcase |
|         end |
|         AND:     src1_vd_2stage[b][7:0] = and_vd_1stage[b][7:0]; |
|         OR:      src1_vd_2stage[b][7:0] = or_vd_1stage[b][7:0]; |
|         XOR:     src1_vd_2stage[b][7:0] = xor_vd_1stage[b][7:0]; |
|         default: src1_vd_2stage[b][7:0] = sum_vd_1stage[b][7:0]; |
|       endcase |
|     end |
|     // ---- extension bit [8] ---- |
|     // only the top byte of an element is extended; all other bytes get 0 |
|     case (rdt_ctrl.rdt_opr) |
|       MAX, |
|       MIN:begin |
|         // unsigned fill bit is 1 for MAX/MIN |
|         // NOTE(review): presumably because the payload is kept complemented for the subtract - confirm |
|         case (pmtrdt_uop.vs1_eew) |
|           EEW32:   src1_vd_2stage[b][8] = (b==3)   ? (rdt_ctrl.sign_opr ? src1_vd_2stage[b][7] : 1'b1) : 1'b0; |
|           EEW16:   src1_vd_2stage[b][8] = (b%2==1) ? (rdt_ctrl.sign_opr ? src1_vd_2stage[b][7] : 1'b1) : 1'b0; |
|           default: src1_vd_2stage[b][8] = rdt_ctrl.sign_opr ? src1_vd_2stage[b][7] : 1'b1; |
|         endcase |
|       end |
|       default:begin |
|         // every other operation: unsigned fill bit is 0 |
|         case (pmtrdt_uop.vs1_eew) |
|           EEW32:   src1_vd_2stage[b][8] = (b==3)   ? (rdt_ctrl.sign_opr ? src1_vd_2stage[b][7] : 1'b0) : 1'b0; |
|           EEW16:   src1_vd_2stage[b][8] = (b%2==1) ? (rdt_ctrl.sign_opr ? src1_vd_2stage[b][7] : 1'b0) : 1'b0; |
|           default: src1_vd_2stage[b][8] = rdt_ctrl.sign_opr ? src1_vd_2stage[b][7] : 1'b0; |
|         endcase |
|       end |
|     endcase |
|   end |
| end |
| |
| // carry_in_vd_2stage data |
| // Carry-in of the four 9-bit adders of the 2nd-stage vs1/vd lane. |
| always_comb begin |
|   // carry into the lowest byte of an element: 1 for MAX/MIN, 0 for everything else |
|   case (rdt_ctrl.rdt_opr) |
|     MAX, |
|     MIN:     carry_in_vd_2stage[0] = 1'b1; |
|     default: carry_in_vd_2stage[0] = 1'b0; |
|   endcase |
|   // remaining bytes either continue the ripple chain or start a new element |
|   case (pmtrdt_uop.vs1_eew) |
|     EEW32:begin |
|       // one 32-bit element: carry ripples through all four bytes |
|       carry_in_vd_2stage[1] = sum_vd_2stage[0][8]; |
|       carry_in_vd_2stage[2] = sum_vd_2stage[1][8]; |
|       carry_in_vd_2stage[3] = sum_vd_2stage[2][8]; |
|     end |
|     EEW16:begin |
|       // two 16-bit elements: byte 2 restarts with the element carry-in |
|       carry_in_vd_2stage[1] = sum_vd_2stage[0][8]; |
|       carry_in_vd_2stage[2] = carry_in_vd_2stage[0]; |
|       carry_in_vd_2stage[3] = sum_vd_2stage[2][8]; |
|     end |
|     default:begin |
|       // four independent bytes: every byte uses the element carry-in |
|       carry_in_vd_2stage[1] = carry_in_vd_2stage[0]; |
|       carry_in_vd_2stage[2] = carry_in_vd_2stage[0]; |
|       carry_in_vd_2stage[3] = carry_in_vd_2stage[0]; |
|     end |
|   endcase |
| end |
| |
| // 2nd-stage vs1/vd lane: per byte one 9-bit adder plus bitwise and/or/xor |
| for (i=0; i<4; i++) begin : gen_rdt_arithmetic_unit_vs1vd_2stage |
|   // bitwise results only use the 8 payload bits of each operand |
|   assign xor_vd_2stage[i] = src1_vd_2stage[i][7:0] ^ src2_vs1_2stage[i][7:0]; |
|   assign or_vd_2stage[i]  = src1_vd_2stage[i][7:0] | src2_vs1_2stage[i][7:0]; |
|   assign and_vd_2stage[i] = src1_vd_2stage[i][7:0] & src2_vs1_2stage[i][7:0]; |
|   // adder works on the full 9-bit operands (payload + extension bit) |
|   assign sum_vd_2stage[i] = carry_in_vd_2stage[i] + src1_vd_2stage[i] + src2_vs1_2stage[i]; |
|   // compare flags are taken from bit 8 of the sum |
|   assign great_than_vd_2stage[i] = ~sum_vd_2stage[i][8]; |
|   assign less_than_vd_2stage[i]  = sum_vd_2stage[i][8]; |
| end |
| // result registers load on a valid uop that is either accepted or not flagged by red_widen_sum_flag |
| assign red_res_en = pmtrdt_uop_valid & (!red_widen_sum_flag | pmtrdt_uop_ready); |
| |
| for (i=0; i<`VLENB/4; i++) begin : gen_reduction_result |
|   // ---- main lane: max_res_ex0/min_res_ex0 based on vs1_eew ---- |
|   // the compare flag of the element's top byte decides for every byte of that element |
|   always_comb begin |
|     case (pmtrdt_uop.vs1_eew) |
|       EEW32:begin |
|         max_res_ex0[i] = great_than_2stage[4*(i/4)+3] ? max_src2_2stage[i][7:0] : ~max_src1_2stage[i][7:0]; |
|         min_res_ex0[i] = less_than_2stage[4*(i/4)+3]  ? min_src2_2stage[i][7:0] : ~min_src1_2stage[i][7:0]; |
|       end |
|       EEW16:begin |
|         max_res_ex0[i] = great_than_2stage[2*(i/2)+1] ? max_src2_2stage[i][7:0] : ~max_src1_2stage[i][7:0]; |
|         min_res_ex0[i] = less_than_2stage[2*(i/2)+1]  ? min_src2_2stage[i][7:0] : ~min_src1_2stage[i][7:0]; |
|       end |
|       default:begin |
|         max_res_ex0[i] = great_than_2stage[i] ? max_src2_2stage[i][7:0] : ~max_src1_2stage[i][7:0]; |
|         min_res_ex0[i] = less_than_2stage[i]  ? min_src2_2stage[i][7:0] : ~min_src1_2stage[i][7:0]; |
|       end |
|     endcase |
|   end |
| |
|   // every candidate result of the main lane is registered under red_res_en |
|   edff #(.T(logic[7:0])) sum_res_reg (.clk(clk), .rst_n(rst_n), .e(red_res_en), .d(sum_res_2stage[i][7:0]), .q(sum_res_ex1[i])); |
|   edff #(.T(logic[7:0])) max_res_reg (.clk(clk), .rst_n(rst_n), .e(red_res_en), .d(max_res_ex0[i]), .q(max_res_ex1[i])); |
|   edff #(.T(logic[7:0])) min_res_reg (.clk(clk), .rst_n(rst_n), .e(red_res_en), .d(min_res_ex0[i]), .q(min_res_ex1[i])); |
|   edff #(.T(logic[7:0])) and_res_reg (.clk(clk), .rst_n(rst_n), .e(red_res_en), .d(and_2stage[i]), .q(and_res_ex1[i])); |
|   edff #(.T(logic[7:0])) or_res_reg (.clk(clk), .rst_n(rst_n), .e(red_res_en), .d(or_2stage[i]), .q(or_res_ex1[i])); |
|   edff #(.T(logic[7:0])) xor_res_reg (.clk(clk), .rst_n(rst_n), .e(red_res_en), .d(xor_2stage[i]), .q(xor_res_ex1[i])); |
| |
|   // ---- vs1/vd lane: max_vs1_ex0/min_vs1_ex0 based on vs1_eew ---- |
|   always_comb begin |
|     case (pmtrdt_uop.vs1_eew) |
|       EEW32:begin |
|         max_vs1_ex0[i] = great_than_vd_2stage[4*(i/4)+3] ? src2_vs1_2stage[i][7:0] : ~src1_vd_2stage[i][7:0]; |
|         min_vs1_ex0[i] = less_than_vd_2stage[4*(i/4)+3]  ? src2_vs1_2stage[i][7:0] : ~src1_vd_2stage[i][7:0]; |
|       end |
|       EEW16:begin |
|         max_vs1_ex0[i] = great_than_vd_2stage[2*(i/2)+1] ? src2_vs1_2stage[i][7:0] : ~src1_vd_2stage[i][7:0]; |
|         min_vs1_ex0[i] = less_than_vd_2stage[2*(i/2)+1]  ? src2_vs1_2stage[i][7:0] : ~src1_vd_2stage[i][7:0]; |
|       end |
|       default:begin |
|         max_vs1_ex0[i] = great_than_vd_2stage[i] ? src2_vs1_2stage[i][7:0] : ~src1_vd_2stage[i][7:0]; |
|         min_vs1_ex0[i] = less_than_vd_2stage[i]  ? src2_vs1_2stage[i][7:0] : ~src1_vd_2stage[i][7:0]; |
|       end |
|     endcase |
|   end |
| |
|   // candidate results of the vs1/vd lane, registered under the same enable |
|   edff #(.T(logic[7:0])) sum_vs1_reg (.clk(clk), .rst_n(rst_n), .e(red_res_en), .d(sum_vd_2stage[i][7:0]), .q(sum_vs1_ex1[i])); |
|   edff #(.T(logic[7:0])) max_vs1_reg (.clk(clk), .rst_n(rst_n), .e(red_res_en), .d(max_vs1_ex0[i]), .q(max_vs1_ex1[i])); |
|   edff #(.T(logic[7:0])) min_vs1_reg (.clk(clk), .rst_n(rst_n), .e(red_res_en), .d(min_vs1_ex0[i]), .q(min_vs1_ex1[i])); |
|   edff #(.T(logic[7:0])) and_vs1_reg (.clk(clk), .rst_n(rst_n), .e(red_res_en), .d(and_vd_2stage[i]), .q(and_vs1_ex1[i])); |
|   edff #(.T(logic[7:0])) or_vs1_reg (.clk(clk), .rst_n(rst_n), .e(red_res_en), .d(or_vd_2stage[i]), .q(or_vs1_ex1[i])); |
|   edff #(.T(logic[7:0])) xor_vs1_reg (.clk(clk), .rst_n(rst_n), .e(red_res_en), .d(xor_vd_2stage[i]), .q(xor_vs1_ex1[i])); |
| end |
| |
| // reduction result when vd_eew is 32b |
| // byte i of each 32-bit result is taken from byte lane i of the 1st-stage vs1/vd unit |
| for (i=0; i<4; i++) begin |
|   assign xor_32b[8*i+7:8*i] = xor_vd_1stage[i]; |
|   assign or_32b[8*i+7:8*i]  = or_vd_1stage[i]; |
|   assign and_32b[8*i+7:8*i] = and_vd_1stage[i]; |
|   // max/min: the flag of the top byte (lane 3) selects the whole 32-bit value |
|   assign min_32b[8*i+7:8*i] = less_than_vd_1stage[3]  ? src2_vs1_1stage[i][7:0] : ~src1_vd_1stage[i][7:0]; |
|   assign max_32b[8*i+7:8*i] = great_than_vd_1stage[3] ? src2_vs1_1stage[i][7:0] : ~src1_vd_1stage[i][7:0]; |
|   assign sum_32b[8*i+7:8*i] = sum_vd_1stage[i][7:0]; |
| end |
| |
| // reduction result when vd_eew is 16b |
| // sum/and/or/xor: fold the two 16-bit halves of the 32-bit results |
| assign sum_16b = sum_32b[15:0] + sum_32b[31:16]; |
| assign and_16b = and_32b[15:0] & and_32b[31:16]; |
| assign or_16b  = or_32b[15:0]  | or_32b[31:16]; |
| assign xor_16b = xor_32b[15:0] ^ xor_32b[31:16]; |
| // max/min candidate of each 16-bit half; the flag of the half's upper byte selects both bytes |
| for (i=0; i<2; i++) begin |
|   assign max_16b_1stage[i][15:0] = great_than_vd_1stage[2*i+1] |
|                                    ? { src2_vs1_1stage[2*i+1][7:0],  src2_vs1_1stage[2*i][7:0]} |
|                                    : {~src1_vd_1stage[2*i+1][7:0], ~src1_vd_1stage[2*i][7:0]}; |
|   assign min_16b_1stage[i][15:0] = less_than_vd_1stage[2*i+1] |
|                                    ? { src2_vs1_1stage[2*i+1][7:0],  src2_vs1_1stage[2*i][7:0]} |
|                                    : {~src1_vd_1stage[2*i+1][7:0], ~src1_vd_1stage[2*i][7:0]}; |
| end |
| // final max/min between the two halves, signed or unsigned per rdt_ctrl_q.sign_opr |
| always_comb begin |
|   if (rdt_ctrl_q.sign_opr) begin |
|     max_16b = $signed(max_16b_1stage[1]) < $signed(max_16b_1stage[0]) |
|               ? max_16b_1stage[0] : max_16b_1stage[1]; |
|     min_16b = $signed(min_16b_1stage[1]) > $signed(min_16b_1stage[0]) |
|               ? min_16b_1stage[0] : min_16b_1stage[1]; |
|   end else begin |
|     max_16b = max_16b_1stage[1] < max_16b_1stage[0] |
|               ? max_16b_1stage[0] : max_16b_1stage[1]; |
|     min_16b = min_16b_1stage[1] > min_16b_1stage[0] |
|               ? min_16b_1stage[0] : min_16b_1stage[1]; |
|   end |
| end |
| |
| // reduction result when vd_eew is 8b |
| // sum: add the four bytes of the 32-bit sum; and/or/xor: fold the 16-bit results once more |
| assign sum_8b = sum_32b[7:0] + sum_32b[15:8] + sum_32b[23:16] + sum_32b[31:24]; |
| assign and_8b = and_16b[7:0] & and_16b[15:8]; |
| assign or_8b  = or_16b[7:0]  | or_16b[15:8]; |
| assign xor_8b = xor_16b[7:0] ^ xor_16b[15:8]; |
| // per-byte max/min candidate from the 1st-stage vs1/vd lane |
| for (i=0; i<4; i++) begin |
|   assign min_8b_1stage[i] = less_than_vd_1stage[i]  ? src2_vs1_1stage[i][7:0] : ~src1_vd_1stage[i][7:0]; |
|   assign max_8b_1stage[i] = great_than_vd_1stage[i] ? src2_vs1_1stage[i][7:0] : ~src1_vd_1stage[i][7:0]; |
| end |
| // running max/min over the four candidates, starting from the extreme value of the range |
| always_comb begin |
|   if (rdt_ctrl_q.sign_opr) begin |
|     // signed: start at most-negative / most-positive |
|     max_8b = 8'h80; |
|     min_8b = 8'h7F; |
|     for (int k=0; k<4; k++) begin |
|       max_8b = $signed(max_8b) > $signed(max_8b_1stage[k]) |
|                ? max_8b : max_8b_1stage[k]; |
|       min_8b = $signed(min_8b) < $signed(min_8b_1stage[k]) |
|                ? min_8b : min_8b_1stage[k]; |
|     end |
|   end else begin |
|     // unsigned: start at 0 / 255 |
|     max_8b = 8'h00; |
|     min_8b = 8'hFF; |
|     for (int k=0; k<4; k++) begin |
|       max_8b = max_8b > max_8b_1stage[k] |
|                ? max_8b : max_8b_1stage[k]; |
|       min_8b = min_8b < min_8b_1stage[k] |
|                ? min_8b : min_8b_1stage[k]; |
|     end |
|   end |
| end |
| |
| // pmtrdt_res_red data |
| // Selects the reduction result by operation, then by element width, and |
| // zero-extends it to `VLEN bits. Unlisted operations yield all zeros. |
| always_comb begin |
|   case (rdt_ctrl_q.rdt_opr) |
|     SUM:begin |
|       case (rdt_ctrl_q.vs1_eew) |
|         EEW32:   pmtrdt_res_red = {{(`VLEN-32){1'b0}},sum_32b}; |
|         EEW16:   pmtrdt_res_red = {{(`VLEN-16){1'b0}},sum_16b}; |
|         default: pmtrdt_res_red = {{(`VLEN-8){1'b0}},sum_8b}; |
|       endcase |
|     end |
|     MAX:begin |
|       case (rdt_ctrl_q.vs1_eew) |
|         EEW32:   pmtrdt_res_red = {{(`VLEN-32){1'b0}},max_32b}; |
|         EEW16:   pmtrdt_res_red = {{(`VLEN-16){1'b0}},max_16b}; |
|         default: pmtrdt_res_red = {{(`VLEN-8){1'b0}},max_8b}; |
|       endcase |
|     end |
|     MIN:begin |
|       case (rdt_ctrl_q.vs1_eew) |
|         EEW32:   pmtrdt_res_red = {{(`VLEN-32){1'b0}},min_32b}; |
|         EEW16:   pmtrdt_res_red = {{(`VLEN-16){1'b0}},min_16b}; |
|         default: pmtrdt_res_red = {{(`VLEN-8){1'b0}},min_8b}; |
|       endcase |
|     end |
|     AND:begin |
|       case (rdt_ctrl_q.vs1_eew) |
|         EEW32:   pmtrdt_res_red = {{(`VLEN-32){1'b0}},and_32b}; |
|         EEW16:   pmtrdt_res_red = {{(`VLEN-16){1'b0}},and_16b}; |
|         default: pmtrdt_res_red = {{(`VLEN-8){1'b0}},and_8b}; |
|       endcase |
|     end |
|     OR:begin |
|       case (rdt_ctrl_q.vs1_eew) |
|         EEW32:   pmtrdt_res_red = {{(`VLEN-32){1'b0}},or_32b}; |
|         EEW16:   pmtrdt_res_red = {{(`VLEN-16){1'b0}},or_16b}; |
|         default: pmtrdt_res_red = {{(`VLEN-8){1'b0}},or_8b}; |
|       endcase |
|     end |
|     XOR:begin |
|       case (rdt_ctrl_q.vs1_eew) |
|         EEW32:   pmtrdt_res_red = {{(`VLEN-32){1'b0}},xor_32b}; |
|         EEW16:   pmtrdt_res_red = {{(`VLEN-16){1'b0}},xor_16b}; |
|         default: pmtrdt_res_red = {{(`VLEN-8){1'b0}},xor_8b}; |
|       endcase |
|     end |
|     default: pmtrdt_res_red = '0; |
|   endcase |
| end |
| end // end if (GEN_RDT == 1'b1) |
| endgenerate |
| |
| // Compare unit |
| generate |
| if (GEN_CMP == 1'b1) begin |
| // cin_data/bin_data |
| // in_data: v0_data shifted right so that bit 0 belongs to the first element of this uop |
| always_comb begin |
|   // default covers every EEW other than 32/16: the shift is uop_index*`VLENB |
|   in_data = {pmtrdt_uop.v0_data >> (pmtrdt_uop.uop_index*`VLENB)}; |
|   case (pmtrdt_uop.vs2_eew) // vmadc/vmsbc inst: vd_eew == vs2_eew |
|     EEW32:   in_data = {pmtrdt_uop.v0_data >> (pmtrdt_uop.uop_index*`VLENB/4)}; |
|     EEW16:   in_data = {pmtrdt_uop.v0_data >> (pmtrdt_uop.uop_index*`VLENB/2)}; |
|     default: ; |
|   endcase |
| end |
| |
| // spread the per-element bits of in_data over the byte lanes of each 4-byte group |
| for (i=0; i<`VLENB/4; i++) begin |
|   always_comb begin |
|     case (pmtrdt_uop.vs2_eew) // vmadc/vmsbc inst: vd_eew == vs2_eew |
|       // one element per group: its bit is copied to all four bytes |
|       EEW32:   cin_data[4*i+:4] = {4{in_data[i]}}; |
|       // two elements per group: each bit is copied to its two bytes |
|       EEW16:   cin_data[4*i+:4] = {{2{in_data[2*i+1]}}, {2{in_data[2*i]}}}; |
|       // one element per byte: bits map straight through |
|       default: cin_data[4*i+:4] = in_data[4*i+:4]; |
|     endcase |
|   end |
| end |
| // bin_data is the bitwise inverse of cin_data |
| assign bin_data = ~cin_data; |
| |
| // cmp_src1/cmp_src2/cmp_carry_in data |
| for (i=0; i<`VLENB/4; i++) begin : gen_cmp_src_data |
| // cmp_src2 data |
| // Bytes of vs2 for this 4-byte group; bit [8] extends the top byte of each element. |
| always_comb begin |
|   // payload: straight copy of the four vs2 bytes |
|   cmp_src2[4*i][7:0]   = pmtrdt_uop.vs2_data[8*(4*i)+:8]; |
|   cmp_src2[4*i+1][7:0] = pmtrdt_uop.vs2_data[8*(4*i+1)+:8]; |
|   cmp_src2[4*i+2][7:0] = pmtrdt_uop.vs2_data[8*(4*i+2)+:8]; |
|   cmp_src2[4*i+3][7:0] = pmtrdt_uop.vs2_data[8*(4*i+3)+:8]; |
|   // extension bit: 0 unless overridden below |
|   cmp_src2[4*i][8]   = 1'b0; |
|   cmp_src2[4*i+1][8] = 1'b0; |
|   cmp_src2[4*i+2][8] = 1'b0; |
|   cmp_src2[4*i+3][8] = 1'b0; |
|   // top byte of each element: sign bit for signed operations, 0 for unsigned |
|   case (pmtrdt_uop.vs2_eew) |
|     EEW32:begin |
|       cmp_src2[4*i+3][8] = rdt_ctrl.sign_opr ? pmtrdt_uop.vs2_data[8*(4*i+3)+7] : 1'b0; |
|     end |
|     EEW16:begin |
|       cmp_src2[4*i+1][8] = rdt_ctrl.sign_opr ? pmtrdt_uop.vs2_data[8*(4*i+1)+7] : 1'b0; |
|       cmp_src2[4*i+3][8] = rdt_ctrl.sign_opr ? pmtrdt_uop.vs2_data[8*(4*i+3)+7] : 1'b0; |
|     end |
|     default:begin |
|       cmp_src2[4*i][8]   = rdt_ctrl.sign_opr ? pmtrdt_uop.vs2_data[8*(4*i)+7]   : 1'b0; |
|       cmp_src2[4*i+1][8] = rdt_ctrl.sign_opr ? pmtrdt_uop.vs2_data[8*(4*i+1)+7] : 1'b0; |
|       cmp_src2[4*i+2][8] = rdt_ctrl.sign_opr ? pmtrdt_uop.vs2_data[8*(4*i+2)+7] : 1'b0; |
|       cmp_src2[4*i+3][8] = rdt_ctrl.sign_opr ? pmtrdt_uop.vs2_data[8*(4*i+3)+7] : 1'b0; |
|     end |
|   endcase |
| end |
| |
| // cmp_src1 data |
| // Built in three steps: (1) load the raw operand bytes, (2) clear the extension |
| // bits, (3) for every operation except COUT complement the bytes and set the |
| // extension bit on the top byte of each element. |
| always_comb begin |
|   // step 1: raw operand bytes |
|   case (pmtrdt_uop.uop_funct3) |
|     OPIVX, |
|     OPIVI:begin |
|       // rs1_data is repeated for every element of the group |
|       case (pmtrdt_uop.vs2_eew) |
|         EEW32:begin |
|           cmp_src1[4*i][7:0]   = pmtrdt_uop.rs1_data[8*0+:8]; |
|           cmp_src1[4*i+1][7:0] = pmtrdt_uop.rs1_data[8*1+:8]; |
|           cmp_src1[4*i+2][7:0] = pmtrdt_uop.rs1_data[8*2+:8]; |
|           cmp_src1[4*i+3][7:0] = pmtrdt_uop.rs1_data[8*3+:8]; |
|         end |
|         EEW16:begin |
|           cmp_src1[4*i][7:0]   = pmtrdt_uop.rs1_data[8*0+:8]; |
|           cmp_src1[4*i+1][7:0] = pmtrdt_uop.rs1_data[8*1+:8]; |
|           cmp_src1[4*i+2][7:0] = pmtrdt_uop.rs1_data[8*0+:8]; |
|           cmp_src1[4*i+3][7:0] = pmtrdt_uop.rs1_data[8*1+:8]; |
|         end |
|         default:begin |
|           cmp_src1[4*i][7:0]   = pmtrdt_uop.rs1_data[0+:8]; |
|           cmp_src1[4*i+1][7:0] = pmtrdt_uop.rs1_data[0+:8]; |
|           cmp_src1[4*i+2][7:0] = pmtrdt_uop.rs1_data[0+:8]; |
|           cmp_src1[4*i+3][7:0] = pmtrdt_uop.rs1_data[0+:8]; |
|         end |
|       endcase |
|     end |
|     default:begin |
|       // vector operand: the matching four bytes of vs1 |
|       cmp_src1[4*i][7:0]   = pmtrdt_uop.vs1_data[8*(4*i)+:8]; |
|       cmp_src1[4*i+1][7:0] = pmtrdt_uop.vs1_data[8*(4*i+1)+:8]; |
|       cmp_src1[4*i+2][7:0] = pmtrdt_uop.vs1_data[8*(4*i+2)+:8]; |
|       cmp_src1[4*i+3][7:0] = pmtrdt_uop.vs1_data[8*(4*i+3)+:8]; |
|     end |
|   endcase |
|   // step 2: extension bits are 0 unless step 3 overrides them |
|   cmp_src1[4*i][8]   = 1'b0; |
|   cmp_src1[4*i+1][8] = 1'b0; |
|   cmp_src1[4*i+2][8] = 1'b0; |
|   cmp_src1[4*i+3][8] = 1'b0; |
|   // step 3: COUT keeps the operand as loaded; every other operation uses the complement |
|   case (rdt_ctrl.cmp_opr) |
|     COUT: ; |
|     default:begin |
|       cmp_src1[4*i][7:0]   = ~cmp_src1[4*i][7:0]; |
|       cmp_src1[4*i+1][7:0] = ~cmp_src1[4*i+1][7:0]; |
|       cmp_src1[4*i+2][7:0] = ~cmp_src1[4*i+2][7:0]; |
|       cmp_src1[4*i+3][7:0] = ~cmp_src1[4*i+3][7:0]; |
|       // top byte of each element: inverted sign bit (= bit 7 of the complemented byte) |
|       // for signed operations, 1 for unsigned |
|       case (pmtrdt_uop.vs2_eew) // compare instruction: vs1_eew == vs2_eew |
|         EEW32:begin |
|           cmp_src1[4*i+3][8] = rdt_ctrl.sign_opr ? cmp_src1[4*i+3][7] : 1'b1; |
|         end |
|         EEW16:begin |
|           cmp_src1[4*i+1][8] = rdt_ctrl.sign_opr ? cmp_src1[4*i+1][7] : 1'b1; |
|           cmp_src1[4*i+3][8] = rdt_ctrl.sign_opr ? cmp_src1[4*i+3][7] : 1'b1; |
|         end |
|         default:begin |
|           cmp_src1[4*i][8]   = rdt_ctrl.sign_opr ? cmp_src1[4*i][7]   : 1'b1; |
|           cmp_src1[4*i+1][8] = rdt_ctrl.sign_opr ? cmp_src1[4*i+1][7] : 1'b1; |
|           cmp_src1[4*i+2][8] = rdt_ctrl.sign_opr ? cmp_src1[4*i+2][7] : 1'b1; |
|           cmp_src1[4*i+3][8] = rdt_ctrl.sign_opr ? cmp_src1[4*i+3][7] : 1'b1; |
|         end |
|       endcase |
|     end |
|   endcase |
| end |
| |
| // cmp_carry_in data |
| // Every byte first receives the element-start carry selected by the operation; |
| // bytes that are not the lowest byte of an element are then re-pointed to the |
| // bit-8 output of the byte below them. |
| always_comb begin |
|   // element-start carry: COUT uses cin_data, BOUT uses bin_data (constants when vm is set), |
|   // every other operation uses 1 |
|   case (rdt_ctrl.cmp_opr) |
|     COUT:begin |
|       cmp_carry_in[4*i]   = rdt_ctrl.vm ? 1'b0 : cin_data[4*i]; |
|       cmp_carry_in[4*i+1] = rdt_ctrl.vm ? 1'b0 : cin_data[4*i+1]; |
|       cmp_carry_in[4*i+2] = rdt_ctrl.vm ? 1'b0 : cin_data[4*i+2]; |
|       cmp_carry_in[4*i+3] = rdt_ctrl.vm ? 1'b0 : cin_data[4*i+3]; |
|     end |
|     BOUT:begin |
|       cmp_carry_in[4*i]   = rdt_ctrl.vm ? 1'b1 : bin_data[4*i]; |
|       cmp_carry_in[4*i+1] = rdt_ctrl.vm ? 1'b1 : bin_data[4*i+1]; |
|       cmp_carry_in[4*i+2] = rdt_ctrl.vm ? 1'b1 : bin_data[4*i+2]; |
|       cmp_carry_in[4*i+3] = rdt_ctrl.vm ? 1'b1 : bin_data[4*i+3]; |
|     end |
|     default:begin |
|       cmp_carry_in[4*i]   = 1'b1; |
|       cmp_carry_in[4*i+1] = 1'b1; |
|       cmp_carry_in[4*i+2] = 1'b1; |
|       cmp_carry_in[4*i+3] = 1'b1; |
|     end |
|   endcase |
|   // ripple chain inside multi-byte elements |
|   case (pmtrdt_uop.vs2_eew) |
|     EEW32:begin |
|       cmp_carry_in[4*i+1] = cmp_sum[4*i][8]; |
|       cmp_carry_in[4*i+2] = cmp_sum[4*i+1][8]; |
|       cmp_carry_in[4*i+3] = cmp_sum[4*i+2][8]; |
|     end |
|     EEW16:begin |
|       cmp_carry_in[4*i+1] = cmp_sum[4*i][8]; |
|       cmp_carry_in[4*i+3] = cmp_sum[4*i+2][8]; |
|     end |
|     default: ; |
|   endcase |
| end |
| end // end for (i=0; i<`VLENB/4; i++) begin : gen_cmp_src_data |
| |
| // per-byte 9-bit adder and the raw flags derived from it |
| for (i=0; i<`VLENB; i++) begin : gen_compare_value |
|   assign cmp_sum[i] = cmp_carry_in[i] + cmp_src1[i] + cmp_src2[i]; |
|   // bit 8 of the sum serves as carry/borrow output and as the less-than flag |
|   assign out_data[i]         = cmp_sum[i][8]; |
|   assign less_than[i]        = cmp_sum[i][8]; |
|   assign great_than_equal[i] = ~cmp_sum[i][8]; |
|   // equality is judged on the low 8 bits of the sum |
|   assign not_equal[i] = |cmp_sum[i][7:0]; |
|   assign equal[i]     = ~(|cmp_sum[i][7:0]); |
| end |
| |
| // cmp_res data |
| // One result bit per element. For multi-byte elements the bit-8 flag comes from |
| // the top byte and the equality flags are combined over all bytes of the element. |
| // For EEW32/EEW16 the element bits are repeated 4x/2x so that all `VLENB bits are driven. |
| always_comb begin |
|   case (pmtrdt_uop.vs2_eew) |
|     EEW32:begin |
|       for (int j=0; j<`VLENB/4; j++) begin |
|         case (rdt_ctrl.cmp_opr) |
|           COUT, |
|           BOUT:               cmp_res[j] = out_data[4*j+3]; |
|           NOT_EQUAL:          cmp_res[j] = |not_equal[4*j+:4]; |
|           EQUAL:              cmp_res[j] = &equal[4*j+:4]; |
|           LESS_THAN:          cmp_res[j] = less_than[4*j+3]; |
|           LESS_THAN_OR_EQUAL: cmp_res[j] = less_than[4*j+3] | (&equal[4*j+:4]); |
|           default:            cmp_res[j] = great_than_equal[4*j+3] & (|not_equal[4*j+:4]); //GREAT_THAN |
|         endcase |
|         // repeat the element bit in the three upper quarters |
|         cmp_res[j+`VLENB/4]   = cmp_res[j]; |
|         cmp_res[j+2*`VLENB/4] = cmp_res[j]; |
|         cmp_res[j+3*`VLENB/4] = cmp_res[j]; |
|       end |
|     end |
|     EEW16:begin |
|       for (int j=0; j<`VLENB/2; j++) begin |
|         case (rdt_ctrl.cmp_opr) |
|           COUT, |
|           BOUT:               cmp_res[j] = out_data[2*j+1]; |
|           NOT_EQUAL:          cmp_res[j] = |not_equal[2*j+:2]; |
|           EQUAL:              cmp_res[j] = &equal[2*j+:2]; |
|           LESS_THAN:          cmp_res[j] = less_than[2*j+1]; |
|           LESS_THAN_OR_EQUAL: cmp_res[j] = less_than[2*j+1] | (&equal[2*j+:2]); |
|           default:            cmp_res[j] = great_than_equal[2*j+1] & (|not_equal[2*j+:2]); //GREAT_THAN |
|         endcase |
|         // repeat the element bit in the upper half |
|         cmp_res[j+`VLENB/2] = cmp_res[j]; |
|       end |
|     end |
|     default:begin |
|       for (int j=0; j<`VLENB; j++) begin |
|         case (rdt_ctrl.cmp_opr) |
|           COUT, |
|           BOUT:               cmp_res[j] = out_data[j]; |
|           NOT_EQUAL:          cmp_res[j] = not_equal[j]; |
|           EQUAL:              cmp_res[j] = equal[j]; |
|           LESS_THAN:          cmp_res[j] = less_than[j]; |
|           LESS_THAN_OR_EQUAL: cmp_res[j] = less_than[j] | equal[j]; |
|           default:            cmp_res[j] = great_than_equal[j] & not_equal[j]; //GREAT_THAN |
|         endcase |
|       end |
|     end |
|   endcase |
| end |
| |
| // cmp_res_offset/cmp_res_en_offset |
| // cmp_res_offset: uop_index scaled by the number of elements one uop covers |
| always_comb begin |
|   // default covers every EEW other than 32/16 (`VLENB elements per uop) |
|   cmp_res_offset = pmtrdt_uop.uop_index * `VLENB; |
|   case (pmtrdt_uop.vs2_eew) |
|     EEW32:   cmp_res_offset = pmtrdt_uop.uop_index * `VLENB/4; |
|     EEW16:   cmp_res_offset = pmtrdt_uop.uop_index * `VLENB/2; |
|     default: ; |
|   endcase |
| end |
| // max eew is 32b or 4B, then VLEN/EEW_max = 4. |
| assign cmp_res_en_offset = cmp_res_offset >> 2; |
| |
| // cmp_res_d/cmp_res_q |
| // cmp_res_en: one enable bit per cmp_res_q register slice. A uop enables 1, 2 or 4 |
| // consecutive slices (EEW32 / EEW16 / other EEW) starting at cmp_res_en_offset. |
| // The mask is widened with an explicit size cast: an unsized '0 is not a legal |
| // concatenation operand, and tools that tolerate it only contribute a single bit. |
| always_comb begin |
|   case (pmtrdt_uop.vs2_eew) |
|     EEW32:   cmp_res_en = $bits(cmp_res_en)'(1'b1)    << cmp_res_en_offset; |
|     EEW16:   cmp_res_en = $bits(cmp_res_en)'(2'b11)   << cmp_res_en_offset; |
|     default: cmp_res_en = $bits(cmp_res_en)'(4'b1111) << cmp_res_en_offset; |
|   endcase |
| end |
| // zero-extend this uop's result bits to the full width, then move them to cmp_res_offset |
| assign cmp_res_d = $bits(cmp_res_d)'(cmp_res) << cmp_res_offset; |
| // a slice is written only when its enable is set and the uop handshake (valid & ready) completes |
| for (i=0; i<(2*`VLENB); i++) begin |
|   edff #(.T(logic[`VLEN/32-1:0])) cmp_res_reg (.q(cmp_res_q[`VLEN/32*i+:`VLEN/32]), .d(cmp_res_d[`VLEN/32*i+:`VLEN/32]), .e(cmp_res_en[i] & pmtrdt_uop_valid & pmtrdt_uop_ready), .clk(clk), .rst_n(rst_n)); |
| end |
| |
| // cmp_vstart: vstart captured from the first uop of a compare instruction |
| // the register loads only when that first uop is accepted (valid & ready) |
| assign cmp_vstart_en = pmtrdt_uop_valid & pmtrdt_uop_ready & pmtrdt_uop.first_uop_valid; |
| assign cmp_vstart_d  = pmtrdt_uop.vstart; |
| edff #(.T(logic[`VSTART_WIDTH-1:0])) cmp_vstart_reg (.clk(clk), .rst_n(rst_n), .e(cmp_vstart_en), .d(cmp_vstart_d), .q(cmp_vstart_q)); |
| // pmtrdt_res_cmp: compare result written back, one bit per element index. |
| // Bits below cmp_vstart_q or at/after rdt_ctrl_q.cmp_evl keep the vs3_data bit. |
| // Inside that window COUT and BOUT always take the compare bit; every other operation |
| // takes it only when rdt_ctrl_q.vm is set or the element's v0_data bit is set, and keeps |
| // the vs3_data bit otherwise. |
| for (i=0; i<`VLEN; i++) begin |
|   always_comb begin |
|     if ((i < cmp_vstart_q) || (i >= rdt_ctrl_q.cmp_evl)) |
|       pmtrdt_res_cmp[i] = rdt_ctrl_q.vs3_data[i]; |
|     else if ((rdt_ctrl_q.cmp_opr == COUT) || (rdt_ctrl_q.cmp_opr == BOUT) || |
|              rdt_ctrl_q.vm || rdt_ctrl_q.v0_data[i]) |
|       pmtrdt_res_cmp[i] = cmp_res_q[i]; |
|     else |
|       pmtrdt_res_cmp[i] = rdt_ctrl_q.vs3_data[i]; |
|   end |
| end |
| end // end if (GEN_CMP == 1'b1) |
| endgenerate |
| |
| // Permutation unit |
| // offset: select element |
| generate |
| if (GEN_PMT == 1'b1) begin |
| // slide/gather instruction |
| // vd may be sourced from any vs2 register of the instruction, so PMT must not start a |
| // slide/gather uop until all uops of that instruction are present in the RS. |
| // rs_entry_valid flags the RS entries below uop_cnt. |
| assign rs_entry_valid = f_rs_decoder(uop_cnt); |
| |
| // pmt_go is raised when some flagged RS entry carries last_uop_valid, entry 0 carries |
| // first_uop_valid, the uop is a non-compress PERMUTATION and the compress control fifo |
| // is empty. |
| always_comb begin |
|   logic last_uop_in_rs; |
|   last_uop_in_rs = 1'b0; |
|   for (int k=0; k<`PMTRDT_RS_DEPTH; k++) begin |
|     last_uop_in_rs |= uop_data[k].last_uop_valid & rs_entry_valid[k]; |
|   end |
|   pmt_go = last_uop_in_rs & uop_data[0].first_uop_valid & (uop_type==PERMUTATION) |
|          & compress_ctrl_empty & ~rdt_ctrl.compress; |
| end |
| |
| // pmt_go_q: pmt_go registered every cycle; trap_flush_rvv drives the clear input. |
| cdffr #(.T(logic)) pmt_go_reg ( |
|   .clk   (clk), |
|   .rst_n (rst_n), |
|   .c     (trap_flush_rvv), |
|   .e     (1'b1), |
|   .d     (pmt_go), |
|   .q     (pmt_go_q) |
| ); |
| |
| for (i=0; i<`VLENB; i++) begin |
| always_comb begin |
| case(pmt_ctrl.pmt_opr) |
| SLIDE_UP:begin |
| if (pmtrdt_uop.uop_funct3 == OPMVX) |
| case (pmtrdt_uop.vs2_eew) // OPMVX slide-up (vslide1up): the source byte lies one element below, i.e. 4/2/1 bytes for EEW32/EEW16/other EEW. Permutation instruction: vd_eew == vs2_eew |
| EEW32:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+i-4; |
| EEW16:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+i-2; |
| default:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+i-1; |
| endcase |
| else |
| case (pmtrdt_uop.vs2_eew) // slide-up by rs1_data elements: the element count is scaled to bytes (x4/x2/x1) before it is subtracted. Permutation instruction: vd_eew == vs2_eew |
| EEW32:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB + i - (4*pmtrdt_uop.rs1_data); |
| EEW16:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB + i - (2*pmtrdt_uop.rs1_data); |
| default:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+ i - pmtrdt_uop.rs1_data; |
| endcase |
| end |
| SLIDE_DOWN:begin |
| if (pmtrdt_uop.uop_funct3 == OPMVX) |
| case (pmtrdt_uop.vs2_eew) // OPMVX slide-down (vslide1down): the source byte lies one element above, i.e. 4/2/1 bytes for EEW32/EEW16/other EEW. Permutation instruction: vd_eew == vs2_eew |
| EEW32:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+i+4; |
| EEW16:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+i+2; |
| default:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+i+1; |
| endcase |
| else |
| case (pmtrdt_uop.vs2_eew) // slide-down by rs1_data elements: the element count is scaled to bytes (x4/x2/x1) before it is added. Permutation instruction: vd_eew == vs2_eew |
| EEW32:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB + i + (4*pmtrdt_uop.rs1_data); |
| EEW16:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB + i + (2*pmtrdt_uop.rs1_data); |
| default:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB + i + pmtrdt_uop.rs1_data; |
| endcase |
| end |
| GATHER:begin |
| case (pmtrdt_uop.uop_funct3) |
| OPIVX, |
| OPIVI:begin |
| case (pmtrdt_uop.vs2_eew) // gather with one index taken from rs1_data: every result element reads the same source element, byte lane i%4 or i%2 selects the byte inside it. Permutation instruction: vd_eew == vs2_eew |
| EEW32:offset[i] = i%4 + {pmtrdt_uop.rs1_data,2'b0}; |
| EEW16:offset[i] = i%2 + {pmtrdt_uop.rs1_data,1'b0}; |
| default:offset[i] = pmtrdt_uop.rs1_data; |
| endcase |
| end |
| default:begin |
| case (pmtrdt_uop.vs1_eew) |
| EEW32: offset[i] = i%4 + (4*{{(`XLEN-32){1'b0}}, uop_data[(pmt_uop_done_cnt_q*`VLENB/4+i/4)/(`VLENB/4)].vs1_data[32*((i/4)%(`VLENB/4))+:32]}); |
| EEW16: begin |
| case (pmtrdt_uop.vs2_eew) // vrgatherei16: indices are 16-bit fields of vs1_data, the data element width follows vs2_eew |
| EEW32:offset[i] = i%4 + (4*{{(`XLEN-16){1'b0}}, uop_data[(pmt_uop_done_cnt_q*`VLENB/4+i/4)/(`VLENB/4)].vs1_data[16*((pmt_uop_done_cnt_q*`VLENB/4+i/4)%(`VLENB/2))+:16]}); |
| EEW16:offset[i] = i%2 + (2*{{(`XLEN-16){1'b0}}, uop_data[(pmt_uop_done_cnt_q*`VLENB/2+i/2)/(`VLENB/2)].vs1_data[16*((i/2)%(`VLENB/2))+:16]}); |
| default:offset[i] = {{(`XLEN-16){1'b0}}, uop_data[(pmt_uop_done_cnt_q*`VLENB+i)/(`VLENB)].vs1_data[16*(i%(`VLENB/2))+:16]}; |
| endcase |
| end |
| default: offset[i] = {{(`XLEN-8){1'b0}}, uop_data[(pmt_uop_done_cnt_q*`VLENB+i)/(`VLENB)].vs1_data[8*(i%(`VLENB))+:8]}; |
| endcase |
| end |
| endcase |
| end |
| default: offset[i] = i; |
| endcase |
| end |
| end |
| |
| // sel_scalar: per-byte flag for the result bytes that are replaced by the scalar x[rs1]. |
| // Only the OPMVX slide forms can set it; everything else leaves it at zero. |
| //   vslide1up   : vd[0] = x[rs1]. The bytes of the lowest element are flagged, and only |
| //                 while pmt_uop_done_cnt_q is 0. |
| //   vslide1down : vd[vl-1] = x[rs1]. The bytes of element (vl-1) modulo the elements per |
| //                 register are flagged once (uop_index+1) * elements-per-register >= vl. |
| // Permutation instruction: vd_eew == vs2_eew. |
| always_comb begin |
|   sel_scalar = '0; |
|   if (pmtrdt_uop.uop_funct3 == OPMVX) begin |
|     if (pmt_ctrl.pmt_opr == SLIDE_UP) begin |
|       if (pmt_uop_done_cnt_q == 0) begin |
|         case (pmtrdt_uop.vs2_eew) |
|           EEW32:   sel_scalar = 'hF; |
|           EEW16:   sel_scalar = 'h3; |
|           default: sel_scalar = 'h1; |
|         endcase |
|       end |
|     end else if (pmt_ctrl.pmt_opr == SLIDE_DOWN) begin |
|       case (pmtrdt_uop.vs2_eew) |
|         EEW32:   sel_scalar = (uop_data[pmt_uop_done_cnt_q].uop_index+1'b1)*(`VLENB/4) >= rdt_ctrl.vl |
|                             ? ('hF << (((rdt_ctrl.vl-1)%(`VLENB/4))*4)) : '0; |
|         EEW16:   sel_scalar = (uop_data[pmt_uop_done_cnt_q].uop_index+1'b1)*(`VLENB/2) >= rdt_ctrl.vl |
|                             ? ('h3 << (((rdt_ctrl.vl-1)%(`VLENB/2))*2)) : '0; |
|         default: sel_scalar = (uop_data[pmt_uop_done_cnt_q].uop_index+1'b1)*`VLENB >= rdt_ctrl.vl |
|                             ? ('h1 << (((rdt_ctrl.vl-1)%(`VLENB))*1)) : '0; |
|       endcase |
|     end |
|   end |
| end |
| |
| // pmt_vs2_data: byte-wise view of the vs2 data held in the RS entries. |
| // With 8-bit data and 16-bit indices (vs2_eew == EEW8, vs1_eew == EEW16) byte b is read |
| // from entry b/(`VLENB/2) at byte lane b%`VLENB, only the lower `VLMAX_MAX/2 bytes are |
| // filled and the upper half is zeroed. |
| // NOTE(review): presumably each vs2 register is carried by two consecutive uops in that |
| // mode - confirm against the decoder. |
| // In every other mode byte b comes from entry b/`VLENB at byte lane b%`VLENB. |
| always_comb begin |
|   if (pmtrdt_uop.vs2_eew == EEW8 && pmtrdt_uop.vs1_eew == EEW16) begin |
|     for (int b=0; b<`VLMAX_MAX/2; b++) begin |
|       pmt_vs2_data[b]               = uop_data[b/(`VLENB/2)].vs2_data[8*(b%(`VLENB))+:8]; |
|       pmt_vs2_data[b+`VLMAX_MAX/2]  = '0; |
|     end |
|   end else begin |
|     for (int b=0; b<`VLMAX_MAX; b++) begin |
|       pmt_vs2_data[b] = uop_data[b/(`VLENB)].vs2_data[8*(b%(`VLENB))+:8]; |
|     end |
|   end |
| end |
| |
| // pmt_vs3_data: byte-wise view of the vs3 data, entry i/`VLENB at byte lane i%`VLENB. |
| for (i=0; i<`VLMAX_MAX; i++) begin |
|   assign pmt_vs3_data[i] = uop_data[i/(`VLENB)].vs3_data[8*(i%(`VLENB))+:8]; |
| end |
| |
| // pmt_rs1_data: the scalar used by vslide1up/vslide1down (rs1_eew == vs2_eew). |
| // The low 32, 16 or 8 bits of rs1_data are replicated until `XLEN bits are filled. |
| always_comb begin |
|   if (pmtrdt_uop.vs2_eew == EEW32) |
|     pmt_rs1_data = {(`XLEN/32){pmtrdt_uop.rs1_data[31:0]}}; |
|   else if (pmtrdt_uop.vs2_eew == EEW16) |
|     pmt_rs1_data = {(`XLEN/16){pmtrdt_uop.rs1_data[15:0]}}; |
|   else |
|     pmt_rs1_data = {(`XLEN/8){pmtrdt_uop.rs1_data[7:0]}}; |
| end |
| |
| // pmt_res_d/pmt_res_q: per-byte permutation result and its register, captured while pmt_go is high. Bytes flagged in sel_scalar take byte i%4 of pmt_rs1_data; every other byte currently resolves to 0 because the data selection below is disabled (see the TODO notes). |
| assign pmt_res_en = pmt_go; |
| for (i=0; i<`VLENB; i++) begin |
| always_comb begin |
| if (sel_scalar[i]) pmt_res_d[i] = pmt_rs1_data[8*(i%4)+:8]; |
| else |
| case (pmt_ctrl.pmt_opr) |
| SLIDE_UP:begin |
| case (pmtrdt_uop.vs2_eew) // permutation instruction: only the default item is active, so every EEW yields 0 |
| // TODO(derekjchow): Fix me - slide-up data selection is disabled; the intended per-EEW expressions are kept below for reference. |
| // EEW32: pmt_res_d[i] = offset[i] >= 4*pmtrdt_uop.vlmax ? pmt_vs3_data[pmt_uop_done_cnt_q*`VLENB+i] : pmt_vs2_data[offset[i]]; |
| // EEW16: pmt_res_d[i] = offset[i] >= 2*pmtrdt_uop.vlmax ? pmt_vs3_data[pmt_uop_done_cnt_q*`VLENB+i] : pmt_vs2_data[offset[i]]; |
| // default: pmt_res_d[i] = offset[i] >= pmtrdt_uop.vlmax ? pmt_vs3_data[pmt_uop_done_cnt_q*`VLENB+i] : pmt_vs2_data[offset[i]]; |
| default: pmt_res_d[i] = 0; |
| endcase |
| end |
| SLIDE_DOWN:begin |
| case (pmtrdt_uop.vs2_eew) |
| // TODO(derekjchow): Fix me - slide-down data selection is disabled; the intended per-EEW expressions are kept below for reference. |
| // EEW32: pmt_res_d[i] = offset[i] >= 4*pmtrdt_uop.vlmax ? '0 : pmt_vs2_data[offset[i]-(pmtrdt_uop.uop_index*`VLENB)]; |
| // EEW16: pmt_res_d[i] = offset[i] >= 2*pmtrdt_uop.vlmax ? '0 : pmt_vs2_data[offset[i]-(pmtrdt_uop.uop_index*`VLENB)]; |
| // default: pmt_res_d[i] = offset[i] >= pmtrdt_uop.vlmax ? '0 : pmt_vs2_data[offset[i]-(pmtrdt_uop.uop_index*`VLENB)]; |
| default: pmt_res_d[i] = 0; |
| endcase |
| end |
| default: begin |
| case (pmtrdt_uop.vs2_eew) |
| // TODO(derekjchow): Fix me - data selection for the remaining operations (e.g. gather) is disabled; the intended per-EEW expressions are kept below for reference. |
| // EEW32: pmt_res_d[i] = offset[i] >= 4*pmtrdt_uop.vlmax ? '0 : pmt_vs2_data[offset[i]]; |
| // EEW16: pmt_res_d[i] = offset[i] >= 2*pmtrdt_uop.vlmax ? '0 : pmt_vs2_data[offset[i]]; |
| // default: pmt_res_d[i] = offset[i] >= pmtrdt_uop.vlmax ? '0 : pmt_vs2_data[offset[i]]; |
| default: pmt_res_d[i] = 0; |
| endcase |
| end |
| endcase |
| end |
| edff #(.T(logic[7:0])) pmt_res_reg (.q(pmt_res_q[i]), .d(pmt_res_d[i]), .e(pmt_res_en), .clk(clk), .rst_n(rst_n)); |
| assign pmtrdt_res_pmt[i*8+:8] = pmt_res_q[i]; |
| end |
| |
| // pmt_uop_done_cnt_d/pmt_uop_done_cnt_q |
| // The counter loads its value plus one whenever pmt_go enables it. Its clear input is |
| // driven when the RS entry it currently selects is flagged last_uop_valid, or on |
| // trap_flush_rvv. |
| assign pmt_uop_done_cnt_d = pmt_uop_done_cnt_q + 1'b1; |
| cdffr #(.T(logic[`UOP_INDEX_WIDTH-1:0])) pmt_uop_done_cnt_reg ( |
|   .clk   (clk), |
|   .rst_n (rst_n), |
|   .c     (uop_data[pmt_uop_done_cnt_q].last_uop_valid | trap_flush_rvv), |
|   .e     (pmt_go), |
|   .d     (pmt_uop_done_cnt_d), |
|   .q     (pmt_uop_done_cnt_q) |
| ); |
| |
| // Compress instruction |
| // vcompress is a special case inside PMT: how many elements land in vd is not known |
| // until the vs1 mask has been decoded. |
| // compress_mask_q keeps the mask bits that have not been consumed yet. The uop with |
| // uop_index 0 starts from vs1_data, later uops continue from compress_mask_q; in both |
| // cases the bits used by the current uop (`VLENB/4, `VLENB/2 or `VLENB of them, by |
| // vs2_eew) are shifted out. vcompress instruction: vd_eew == vs2_eew. |
| always_comb begin |
|   case (pmtrdt_uop.vs2_eew) |
|     EEW32:   compress_mask_d = ((pmtrdt_uop.uop_index == '0) ? pmtrdt_uop.vs1_data : compress_mask_q) >> (`VLENB/4); |
|     EEW16:   compress_mask_d = ((pmtrdt_uop.uop_index == '0) ? pmtrdt_uop.vs1_data : compress_mask_q) >> (`VLENB/2); |
|     default: compress_mask_d = ((pmtrdt_uop.uop_index == '0) ? pmtrdt_uop.vs1_data : compress_mask_q) >> `VLENB; |
|   endcase |
| end |
| |
| // The register is updated on every accepted uop (valid & ready). |
| assign compress_mask_en = pmtrdt_uop_valid & pmtrdt_uop_ready; |
| edff #(.T(logic[`VLEN-1:0])) compress_mask_reg ( |
|   .clk   (clk), |
|   .rst_n (rst_n), |
|   .e     (compress_mask_en), |
|   .d     (compress_mask_d), |
|   .q     (compress_mask_q) |
| ); |
| |
| // compress_enable: per-byte select for the current uop. |
| // The low mask bits (from vs1_data for uop_index 0, from compress_mask_q afterwards) are |
| // each repeated over the bytes of their element, giving `VLENB enable bits. |
| always_comb begin |
|   case (pmtrdt_uop.vs2_eew) |
|     EEW32: begin |
|       for (int e=0; e<`VLENB/4; e++) begin |
|         compress_enable[4*e+:4] = {4{(pmtrdt_uop.uop_index == '0) ? pmtrdt_uop.vs1_data[e] : compress_mask_q[e]}}; |
|       end |
|     end |
|     EEW16: begin |
|       for (int e=0; e<`VLENB/2; e++) begin |
|         compress_enable[2*e+:2] = {2{(pmtrdt_uop.uop_index == '0) ? pmtrdt_uop.vs1_data[e] : compress_mask_q[e]}}; |
|       end |
|     end |
|     default: begin |
|       compress_enable = (pmtrdt_uop.uop_index == '0) ? pmtrdt_uop.vs1_data[`VLENB-1:0] : compress_mask_q[`VLENB-1:0]; |
|     end |
|   endcase |
| end |
| |
| // compress_body: per-byte body/tail flag derived from vl and uop_index. |
| // 0: tail element; 1: body element |
| // When vl lies beyond the first element of this uop, the bytes of the elements below vl |
| // are flagged; otherwise the whole uop is tail. |
| always_comb begin |
|   if (pmtrdt_uop.vs2_eew == EEW32) |
|     compress_body = (pmtrdt_uop.vl > pmtrdt_uop.uop_index*`VLENB/4) ? ~({`VLENB{1'b1}} << (4*(pmtrdt_uop.vl - pmtrdt_uop.uop_index*`VLENB/4))) : '0; |
|   else if (pmtrdt_uop.vs2_eew == EEW16) |
|     compress_body = (pmtrdt_uop.vl > pmtrdt_uop.uop_index*`VLENB/2) ? ~({`VLENB{1'b1}} << (2*(pmtrdt_uop.vl - pmtrdt_uop.uop_index*`VLENB/2))) : '0; |
|   else |
|     compress_body = (pmtrdt_uop.vl > pmtrdt_uop.uop_index*`VLENB) ? ~({`VLENB{1'b1}} << (pmtrdt_uop.vl - pmtrdt_uop.uop_index*`VLENB)) : '0; |
| end |
| |
| // compress_cnt: number of bytes compressed so far. |
| // The uop with uop_index 0 restarts the count; later uops add the bytes that are both |
| // enabled by the mask and part of the body. |
| always_comb begin |
|   logic [VLENB_WIDTH:0] bytes_this_uop; |
|   bytes_this_uop = f_sum(compress_enable & compress_body); |
|   if (pmtrdt_uop.uop_index == '0) compress_cnt_d = bytes_this_uop; |
|   else                            compress_cnt_d = compress_cnt_q + bytes_this_uop; |
| end |
| |
| // The count is loaded when a compress uop is accepted. |
| assign compress_cnt_en  = pmtrdt_uop_valid & pmtrdt_uop_ready & rdt_ctrl.compress; |
| assign compress_cnt_clr = rdt_ctrl_q.last_uop_valid & ~compress_cnt_gt_vlenb; |
| cdffr #(.T(logic[VLENB_WIDTH:0])) compress_cnt_reg ( |
|   .clk   (clk), |
|   .rst_n (rst_n), |
|   .c     ((~compress_cnt_en & compress_cnt_clr) | trap_flush_rvv), |
|   .e     (compress_cnt_en), |
|   .d     (compress_cnt_d), |
|   .q     (compress_cnt_q) |
| ); |
| |
| // compress_cnt_qq follows compress_cnt_q one cycle later. |
| cdffr #(.T(logic[VLENB_WIDTH:0])) compress_cnt_reg_reg ( |
|   .clk   (clk), |
|   .rst_n (rst_n), |
|   .c     (compress_cnt_clr | trap_flush_rvv), |
|   .e     (1'b1), |
|   .d     (compress_cnt_q), |
|   .q     (compress_cnt_qq) |
| ); |
| |
| // compress_cnt_ge_vlenb: the top count bit differs between compress_cnt_q and its delayed |
| // copy, i.e. at least `VLENB bytes are collected and a result is written to the ROB. |
| // compress_cnt_gt_vlenb: as above, and the low count bits are not all zero. |
| assign compress_cnt_ge_vlenb = compress_cnt_qq[VLENB_WIDTH] ^ compress_cnt_q[VLENB_WIDTH]; |
| assign compress_cnt_gt_vlenb = compress_cnt_ge_vlenb & (|compress_cnt_q[VLENB_WIDTH-1:0]); |
| |
| // compress_offset[k] names the vs2_data byte that lands in packed slot k; an all-ones |
| // entry marks a slot with no source byte, which reads as zero in compress_value. |
| assign compress_offset = f_compress_offset(compress_enable); |
| for (i=0; i<`VLENB; i++) begin |
|   assign compress_value[i] = (compress_offset[i] != '1) ? pmtrdt_uop.vs2_data[8*compress_offset[i]+:8] : '0; |
| end |
| |
| // compress_res_d: packed bytes of this uop placed inside the 2*`VLENB-byte result buffer. |
| // The first uop writes from byte 0 upwards; later uops are rotated to the position given |
| // by compress_cnt_q. |
| // The zero extension uses an explicit size cast: inside a concatenation an unbased |
| // unsized literal such as '0 is only one bit wide, so {'0, x} does not express it. |
| always_comb begin |
|   if (pmtrdt_uop.first_uop_valid) compress_res_d = $bits(compress_res_d)'(compress_value); |
|   else                            compress_res_d = f_circular_shift(compress_value, compress_cnt_q); |
| end |
| |
| // compress_res_en: byte enables of the result buffer. Nothing is written unless a |
| // compress uop is pushed; then one enable per packed byte is raised, positioned the same |
| // way as the data. |
| always_comb begin |
|   compress_res_en = '0; |
|   if (compress_ctrl_push) begin |
|     if (pmtrdt_uop.first_uop_valid) compress_res_en = $bits(compress_res_en)'(f_pack_1s(compress_enable)); |
|     else                            compress_res_en = f_circular_en(compress_enable, compress_cnt_q); |
|   end |
| end |
| |
| for (i=0; i<2*`VLENB; i++) begin |
|   edff #(.T(logic[7:0])) compress_res_reg ( |
|     .q     (compress_res_q[i]), |
|     .d     (compress_res_d[i]), |
|     .e     (compress_res_en[i]), |
|     .clk   (clk), |
|     .rst_n (rst_n) |
|   ); |
| end |
| |
| // pmtrdt_res_compress |
| // The delayed count's top bit picks the half of the result buffer that is written back. |
| // For the last uop the selected half is merged with vs3_data from the control fifo: the |
| // first valid_num bytes come from the buffer, the rest keep vs3_data. |
| // valid_num[0] is the byte count itself, valid_num[1] the count minus `VLENB. |
| assign valid_num[0] = compress_cnt_q[VLENB_WIDTH:0]; |
| assign valid_num[1] = compress_cnt_q[VLENB_WIDTH:0] - `VLENB; |
| always_comb begin |
|   if (compress_cnt_qq[VLENB_WIDTH]) begin |
|     // upper half of the buffer |
|     if (rdt_ctrl_q.last_uop_valid) |
|       pmtrdt_res_compress = f_res_compress_merge(compress_ctrl_ex1.vs3_data, compress_res_q[`VLENB+:`VLENB], valid_num[1]); |
|     else |
|       pmtrdt_res_compress = compress_res_q[`VLENB+:`VLENB]; |
|   end else begin |
|     // lower half of the buffer |
|     if (rdt_ctrl_q.last_uop_valid) |
|       pmtrdt_res_compress = f_res_compress_merge(compress_ctrl_ex1.vs3_data, compress_res_q[0+:`VLENB], valid_num[0]); |
|     else |
|       pmtrdt_res_compress = compress_res_q[0+:`VLENB]; |
|   end |
| end |
| |
| // compress control fifo |
| // Depending on the vs1 value, one or several uops together fill one vd; the elements of |
| // vd that are left over are treated as tail elements. |
| // The fifo stores the write-back context (rob_entry, vs3_data, last_uop_valid) of every |
| // accepted compress uop. |
| assign compress_ctrl_ex0.last_uop_valid = pmtrdt_uop.last_uop_valid; |
| assign compress_ctrl_ex0.vs3_data       = pmtrdt_uop.vs3_data; |
| assign compress_ctrl_ex0.rob_entry      = pmtrdt_uop.rob_entry; |
| `ifdef TB_SUPPORT |
| assign compress_ctrl_ex0.uop_pc         = pmtrdt_uop.uop_pc; |
| `endif |
| |
| // push: a compress uop is accepted. |
| // pop : `VLENB bytes have been collected, or the registered uop is the last compress uop. |
| assign compress_ctrl_push = pmtrdt_uop_valid & pmtrdt_uop_ready & rdt_ctrl.compress; |
| assign compress_ctrl_pop  = (compress_cnt_ge_vlenb | (rdt_ctrl_q.last_uop_valid & rdt_ctrl_q.compress)); |
| |
| multi_fifo #( |
|   .T          (COMPRESS_CTRL_t), |
|   .M          (1), |
|   .N          (1), |
|   .DEPTH      (`EMUL_MAX), |
|   .ASYNC_RSTN (1) |
| ) compress_ctrl_fifo ( |
|   .clk          (clk), |
|   .rst_n        (rst_n), |
|   .clear        (trap_flush_rvv), |
|   .push         (compress_ctrl_push), |
|   .datain       (compress_ctrl_ex0), |
|   .pop          (compress_ctrl_pop), |
|   .dataout      (compress_ctrl_ex1), |
|   .empty        (compress_ctrl_empty), |
|   .full         (), |
|   .almost_full  (), |
|   .almost_empty (), |
|   .fifo_data    (), |
|   .wptr         (), |
|   .rptr         (), |
|   .entry_count  () |
| ); |
| |
| end // if (GEN_PMT == 1'b1) |
| endgenerate |
| |
| // output result |
| // pmtrdt_res_valid: a permutation result is valid when the compress fifo pops (compress) |
| // or one cycle after pmt_go (other permutations); every other uop type reports its |
| // result with the last uop. |
| always_comb begin |
|   if (uop_type_q == PERMUTATION) |
|     pmtrdt_res_valid = rdt_ctrl_q.compress ? compress_ctrl_pop : pmt_go_q; |
|   else |
|     pmtrdt_res_valid = rdt_ctrl_q.last_uop_valid; |
| end |
| |
| // pmtrdt_res: result record handed to the ROB. |
| // Permutation uops take their bookkeeping fields from the compress fifo head (compress) |
| // or from pmt_ctrl_q; all other uop types take them from rdt_ctrl_q. |
| always_comb begin |
| `ifdef TB_SUPPORT |
|   // uop_pc |
|   if (uop_type_q == PERMUTATION) |
|     pmtrdt_res.uop_pc = rdt_ctrl_q.compress ? compress_ctrl_ex1.uop_pc : pmt_ctrl_q.uop_pc; |
|   else |
|     pmtrdt_res.uop_pc = rdt_ctrl_q.uop_pc; |
| `endif |
| |
|   // rob_entry |
|   if (uop_type_q == PERMUTATION) |
|     pmtrdt_res.rob_entry = rdt_ctrl_q.compress ? compress_ctrl_ex1.rob_entry : pmt_ctrl_q.rob_entry; |
|   else |
|     pmtrdt_res.rob_entry = rdt_ctrl_q.rob_entry; |
| |
|   // write valid is always set, saturate is never flagged by this unit |
|   pmtrdt_res.w_valid   = 1'b1; |
|   pmtrdt_res.vsaturate = '0; |
| |
|   // data: COMPARE and any unlisted uop type both return the compare result |
|   case (uop_type_q) |
|     PERMUTATION: pmtrdt_res.w_data = rdt_ctrl_q.compress ? pmtrdt_res_compress : pmtrdt_res_pmt; |
|     REDUCTION:   pmtrdt_res.w_data = pmtrdt_res_red; |
|     default:     pmtrdt_res.w_data = pmtrdt_res_cmp; |
|   endcase |
| end |
| |
| // pmtrdt_uop_ready: |
| // While the compress control fifo is empty |
| //   1. PMT, compress  - ready when the fifo head is flagged last_uop_valid or the |
| //                       registered uop is not a last uop. |
| //   2. PMT, others    - ready when the RS entry selected by pmt_uop_done_cnt_q is a last |
| //                       uop, or RS entry 0 is not a first uop. |
| //   3. RDT            - VWREDSUMU&VWREDSUM (widen): ready only while red_widen_sum_flag is 1. |
| //                       the others: always 1. |
| //   4. anything else (e.g. CMP) - always 1. |
| // While the fifo holds entries |
| //   compress uops use the rule of case 1; every other uop is ready only when pmt_go hits |
| //   a last uop, or RS entry 0 is not a first uop. |
| // red_widen_sum_flag inverts on every cycle in which a valid widening uop is presented; |
| // trap_flush_rvv drives its clear input. |
| cdffr #(.T(logic)) wredsum_flag_reg ( |
|   .clk   (clk), |
|   .rst_n (rst_n), |
|   .c     (trap_flush_rvv), |
|   .e     (rdt_ctrl.widen & pmtrdt_uop_valid), |
|   .d     (~red_widen_sum_flag), |
|   .q     (red_widen_sum_flag) |
| ); |
| always_comb begin |
|   if (compress_ctrl_empty) begin |
|     if (uop_type == PERMUTATION) begin |
|       pmtrdt_uop_ready = rdt_ctrl.compress ? (compress_ctrl_ex1.last_uop_valid | ~rdt_ctrl_q.last_uop_valid) |
|                                            : (uop_data[pmt_uop_done_cnt_q].last_uop_valid || ~uop_data[0].first_uop_valid); |
|     end else if (uop_type == REDUCTION) begin |
|       if (rdt_ctrl.widen) pmtrdt_uop_ready = red_widen_sum_flag; |
|       else                pmtrdt_uop_ready = 1'b1; |
|     end else begin |
|       pmtrdt_uop_ready = 1'b1; |
|     end |
|   end else begin |
|     pmtrdt_uop_ready = rdt_ctrl.compress ? (compress_ctrl_ex1.last_uop_valid | ~rdt_ctrl_q.last_uop_valid) |
|                                          : ((pmt_go & uop_data[pmt_uop_done_cnt_q].last_uop_valid) | ~uop_data[0].first_uop_valid); |
|   end |
| end |
| |
| // ---function-------------------------------------------------------- |
| // f_sum: count the asserted bits of a `VLENB-bit vector. |
| //   vector_bits : bits to count |
| //   returns     : number of bits that are 1 (0 .. `VLENB) |
| function logic [VLENB_WIDTH:0] f_sum (input logic [`VLENB-1:0] vector_bits); |
|   begin |
|     f_sum = '0; |
|     for (int b=0; b<`VLENB; b++) begin |
|       f_sum += vector_bits[b]; |
|     end |
|   end |
| endfunction |
| |
| // f_compress_offset: list the positions of the asserted enable bits, lowest first. |
| //   enables : one enable bit per byte |
| //   returns : entry k holds the index of the k-th asserted bit; entries beyond the |
| //             number of asserted bits stay all ones. |
| function logic [`VLENB-1:0][VLENB_WIDTH:0] f_compress_offset (input logic [`VLENB-1:0] enables); |
|   int wr_slot; |
|   logic [`VLENB-1:0][VLENB_WIDTH:0] packed_idx; |
|   begin |
|     wr_slot    = 0; |
|     packed_idx = '1;                  // every entry starts as "no source" |
|     for (int src=0; src<`VLENB; src++) begin |
|       if (enables[src]) begin |
|         packed_idx[wr_slot] = src; |
|         wr_slot = wr_slot + 1; |
|       end |
|     end |
|     f_compress_offset = packed_idx; |
|   end |
| endfunction |
| |
| // f_circular_shift: place `VLENB bytes into a 2*`VLENB-byte buffer, rotated by `shift` bytes. |
| //   value   : bytes to place, byte 0 first |
| //   shift   : byte position of value[0] inside the buffer |
| //   returns : the buffer image; bytes pushed past the top wrap around to the bottom when |
| //             the top bit of shift is set. |
| function logic [2*`VLENB-1:0][7:0] f_circular_shift ( |
|   input logic [`VLENB-1:0][7:0] value, |
|   input logic [VLENB_WIDTH:0]   shift |
| ); |
|   logic [`VLEN-1:0] flat_value; |
|   logic [`VLEN-1:0] seg_hi, seg_mid, seg_lo; |
|   begin |
|     flat_value = value; |
|     // three register-wide segments so that nothing is lost for any shift amount |
|     {seg_hi, seg_mid, seg_lo} = flat_value << (shift*8); |
|     f_circular_shift = shift[VLENB_WIDTH] ? {seg_mid, seg_hi} : {seg_mid, seg_lo}; |
|   end |
| endfunction |
| |
| // f_pack_1s: collect all 1s of the input and pack them at the low end. |
| //   value   : arbitrary bit pattern |
| //   returns : a vector whose lowest N bits are 1, N being the number of 1s in value |
| function logic [`VLENB-1:0] f_pack_1s (input logic [`VLENB-1:0] value); |
|   begin |
|     f_pack_1s = '0; |
|     for (int b=0; b<`VLENB; b++) begin |
|       // every asserted input bit shifts one more 1 in from the bottom |
|       if (value[b]) f_pack_1s = (f_pack_1s << 1) | 1'b1; |
|     end |
|   end |
| endfunction |
| |
| // f_circular_en: byte enables matching f_circular_shift. |
| //   value   : per-byte enables of the current uop |
| //   shift   : byte position inside the 2*`VLENB-byte buffer where writing starts |
| //   returns : one enable per buffer byte; the packed 1s of value start at `shift` and |
| //             wrap around to the bottom when the top bit of shift is set. |
| function logic [2*`VLENB-1:0] f_circular_en ( |
|   input logic [`VLENB-1:0]    value, |
|   input logic [VLENB_WIDTH:0] shift |
| ); |
|   logic [`VLENB-1:0] seg_hi, seg_mid, seg_lo; |
|   begin |
|     {seg_hi, seg_mid, seg_lo} = f_pack_1s(value) << shift; |
|     f_circular_en = shift[VLENB_WIDTH] ? {seg_mid, seg_hi} : {seg_mid, seg_lo}; |
|   end |
| endfunction |
| |
| // f_res_compress_merge: merge raw data with the compress result. |
| //   raw_data  : bytes kept where no compressed byte exists |
| //   res_data  : compressed bytes |
| //   valid_num : number of compressed bytes, counted from byte 0 |
| //   returns   : res_data for byte positions below valid_num, raw_data elsewhere |
| function logic [`VLEN-1:0] f_res_compress_merge ( |
|   input logic [`VLENB-1:0][7:0] raw_data, |
|   input logic [`VLENB-1:0][7:0] res_data, |
|   input logic [VLENB_WIDTH:0]   valid_num |
| ); |
|   logic [`VLENB-1:0][7:0] merged; |
|   begin |
|     for (int b=0; b<`VLENB; b++) begin |
|       if (b < valid_num) merged[b] = res_data[b]; |
|       else               merged[b] = raw_data[b]; |
|     end |
|     f_res_compress_merge = merged; |
|   end |
| endfunction |
| |
| // f_rs_decoder: decoder for reservation station |
| //   cnt     : number of occupied RS entries |
| //   returns : a mask with the lowest cnt bits set |
| function logic [`PMTRDT_RS_DEPTH-1:0] f_rs_decoder (input logic [$clog2(`PMTRDT_RS_DEPTH):0] cnt); |
|   begin |
|     // shift zeros in from the bottom of an all-ones word, then invert |
|     f_rs_decoder = ~({`PMTRDT_RS_DEPTH{1'b1}} << cnt); |
|   end |
| endfunction |
| |
| endmodule |