// blob: 77a66bd0ee475b3ccd3850b0d5bec441430fa3b0 [file] [log] [blame]
`ifndef HDL_VERILOG_RVV_DESIGN_RVV_SVH
`include "rvv_backend.svh"
`endif
`ifndef DIV_DEFINE_SVH
`include "rvv_backend_div.svh"
`endif
`ifndef RVV_ASSERT__SVH
`include "rvv_backend_sva.svh"
`endif
//
// rvv_backend_div_unit
//
// Executes one vector divide/remainder uop (VDIVU/VDIV/VREMU/VREM, in the
// OPMVV or OPMVX form) taken from the DIV reservation station and presents
// the VLEN-wide result to the ROB.
//
// The VLEN-wide operands are spread over three banks of dividers:
//   * `VLENB/2             8-bit  dividers (DIVIDER8)
//   * `VLEN/`HWORD_WIDTH/2 16-bit dividers (DIVIDER16)
//   * `VLEN/`WORD_WIDTH    32-bit dividers (DIVIDER32)
// Depending on vs2_eew the elements are mapped as follows:
//   EEW8 : low half of the bytes      -> 8-bit dividers,
//          next quarter of the bytes  -> 16-bit dividers (extended),
//          top quarter of the bytes   -> 32-bit dividers (extended)
//   EEW16: low half of the halfwords  -> 16-bit dividers,
//          high half of the halfwords -> 32-bit dividers (extended)
//   EEW32: every word                 -> 32-bit dividers
// Narrow elements are zero-extended for VDIVU/VREMU and sign-extended for
// VDIV/VREM before entering a wider divider; only the low element-width bits
// of that divider's quotient/remainder are written back.
//
// Ports:
//   clk, rst_n      - forwarded to every divider instance.
//   div_uop_valid   - qualifies div_uop.
//   div_uop         - uop from the DIV RS (rob_entry, funct6/funct3, vs1/vs2/rs1
//                     data with their valid flags, vs2_eew).
//   result_valid    - high when every divider used by the current vs2_eew
//                     reports a valid result.
//   result          - rob_entry, w_data, w_valid (= result_valid) and
//                     vsaturate (tied to 0) sent to the ROB.
//   result_ready    - ROB accepts the result; (result_ready & result_valid) is
//                     forwarded to every divider as its result_ready.
//   trap_flush_rvv  - forwarded to every divider instance.
//
module rvv_backend_div_unit
(
  clk,
  rst_n,
  div_uop_valid,
  div_uop,
  result_valid,
  result,
  result_ready,
  trap_flush_rvv
);
//
// interface signals
//
  // global signals
  input   logic       clk;
  input   logic       rst_n;

  // DIV RS handshake signals
  input   logic       div_uop_valid;
  input   DIV_RS_t    div_uop;

  // DIV send result signals to ROB
  output  logic       result_valid;
  output  PU2ROB_t    result;
  input   logic       result_ready;

  // trap-flush
  input   logic       trap_flush_rvv;

//
// local constants
//
  // Number of extension bits prepended when a narrow element is fed to a wider
  // divider. These evaluate to 8 / 24 / 16 for 8/16/32-bit element widths.
  localparam int EXT_BYTE_TO_HWORD = `HWORD_WIDTH-`BYTE_WIDTH;
  localparam int EXT_BYTE_TO_WORD  = `WORD_WIDTH-`BYTE_WIDTH;
  localparam int EXT_HWORD_TO_WORD = `WORD_WIDTH-`HWORD_WIDTH;

//
// internal signals
//
  // DIV_RS_t struct signals
  logic   [`ROB_DEPTH_WIDTH-1:0]  rob_entry;
  FUNCT6_u                        uop_funct6;
  logic   [`FUNCT3_WIDTH-1:0]     uop_funct3;
  logic   [`VLEN-1:0]             vs1_data;
  logic                           vs1_data_valid;
  logic   [`VLEN-1:0]             vs2_data;
  logic                           vs2_data_valid;
  EEW_e                           vs2_eew;
  logic   [`XLEN-1:0]             rs1_data;
  logic                           rs1_data_valid;

  // decoded operation class (all low when funct3/funct6 is not a divide uop)
  logic                           is_div_op;      // VDIVU/VDIV/VREMU/VREM under OPMVV or OPMVX
  logic                           is_signed_op;   // VDIV/VREM
  logic                           is_rem_op;      // VREMU/VREM
  logic                           is_scalar_op;   // funct3 is OPMVX: divisor comes from rs1

  // execute
  logic                                                 uop_valid;
  logic   [`VLEN-1:0]                                   src1_vec;   // divisor in vs1 layout
  logic   [`VLENB/2-1:0][`BYTE_WIDTH-1:0]               src2_data8;
  logic   [`VLEN/`HWORD_WIDTH/2-1:0][`HWORD_WIDTH-1:0]  src2_data16;
  logic   [`VLEN/`WORD_WIDTH-1:0][`WORD_WIDTH-1:0]      src2_data32;
  logic   [`VLENB/2-1:0][`BYTE_WIDTH-1:0]               src1_data8;
  logic   [`VLEN/`HWORD_WIDTH/2-1:0][`HWORD_WIDTH-1:0]  src1_data16;
  logic   [`VLEN/`WORD_WIDTH-1:0][`WORD_WIDTH-1:0]      src1_data32;
  logic   [`VLENB/2-1:0][`BYTE_WIDTH-1:0]               quotient8;
  logic   [`VLEN/`HWORD_WIDTH/2-1:0][`HWORD_WIDTH-1:0]  quotient16;
  logic   [`VLEN/`WORD_WIDTH-1:0][`WORD_WIDTH-1:0]      quotient32;
  logic   [`VLENB/2-1:0][`BYTE_WIDTH-1:0]               remainder8;
  logic   [`VLEN/`HWORD_WIDTH/2-1:0][`HWORD_WIDTH-1:0]  remainder16;
  logic   [`VLEN/`WORD_WIDTH-1:0][`WORD_WIDTH-1:0]      remainder32;
  // quotient or remainder of each bank, whichever the uop asks for
  logic   [`VLENB/2-1:0][`BYTE_WIDTH-1:0]               lane_res8;
  logic   [`VLEN/`HWORD_WIDTH/2-1:0][`HWORD_WIDTH-1:0]  lane_res16;
  logic   [`VLEN/`WORD_WIDTH-1:0][`WORD_WIDTH-1:0]      lane_res32;
  logic   [`VLENB/2-1:0]                                result_valid8;
  logic   [`VLEN/`HWORD_WIDTH/2-1:0]                    result_valid16;
  logic   [`VLEN/`WORD_WIDTH-1:0]                       result_valid32;
  logic   [`VLEN-1:0]                                   result_data;
  logic                                                 result_all_valid;
  DIV_SIGN_SRC_e                                        opcode;

  // for-loop
  genvar                                                j;

//
// prepare source data to calculate
//
  // split DIV_RS_t struct
  assign  rob_entry       = div_uop.rob_entry;
  assign  uop_funct6      = div_uop.uop_funct6;
  assign  uop_funct3      = div_uop.uop_funct3;
  assign  vs1_data        = div_uop.vs1_data;
  assign  vs1_data_valid  = div_uop.vs1_data_valid;
  assign  vs2_data        = div_uop.vs2_data;
  assign  vs2_data_valid  = div_uop.vs2_data_valid;
  assign  vs2_eew         = div_uop.vs2_eew;
  assign  rs1_data        = div_uop.rs1_data;
  assign  rs1_data_valid  = div_uop.rs1_data_valid;

//
// decode the operation once; every later stage keys off these flags
//
  always_comb begin
    // default: not a divide uop
    is_div_op     = 1'b0;
    is_signed_op  = 1'b0;
    is_rem_op     = 1'b0;
    is_scalar_op  = 1'b0;

    case(uop_funct3)
      OPMVV,
      OPMVX: begin
        case(uop_funct6.ari_funct6)
          VDIVU: begin
            is_div_op     = 1'b1;
          end
          VDIV: begin
            is_div_op     = 1'b1;
            is_signed_op  = 1'b1;
          end
          VREMU: begin
            is_div_op     = 1'b1;
            is_rem_op     = 1'b1;
          end
          VREM: begin
            is_div_op     = 1'b1;
            is_signed_op  = 1'b1;
            is_rem_op     = 1'b1;
          end
        endcase
      end
    endcase

    case(uop_funct3)
      OPMVX: begin
        is_scalar_op = 1'b1;
      end
    endcase
  end

//
// prepare source data
//
  // prepare valid signal: the uop may start only when it is a divide uop and
  // both of the operands it actually reads are valid
  always_comb begin
    uop_valid = 'b0;

    if (is_div_op) begin
      if (is_scalar_op)
        uop_valid = div_uop_valid&vs2_data_valid&rs1_data_valid;
      else
        uop_valid = div_uop_valid&vs2_data_valid&vs1_data_valid;
    end
  end

  // Build the divisor in the same element layout as vs2. For the OPMVX form
  // the low vs2_eew bits of rs1 are broadcast to every element, so the lane
  // packing below can treat vector and scalar divisors identically.
  always_comb begin
    src1_vec = vs1_data;

    if (is_scalar_op) begin
      case(vs2_eew)
        EEW8: begin
          src1_vec = {(`VLENB){rs1_data[0 +: `BYTE_WIDTH]}};
        end
        EEW16: begin
          src1_vec = {(`VLEN/`HWORD_WIDTH){rs1_data[0 +: `HWORD_WIDTH]}};
        end
        EEW32: begin
          src1_vec = {(`VLEN/`WORD_WIDTH){rs1_data[0 +: `WORD_WIDTH]}};
        end
      endcase
    end
  end

  // Distribute dividend (vs2) and divisor (src1_vec) elements over the three
  // divider banks. The extension bit is (is_signed_op & element MSB): zero for
  // VDIVU/VREMU, the sign bit for VDIV/VREM.
  always_comb begin
    // initial the data
    src2_data8  = 'b0;
    src2_data16 = 'b0;
    src2_data32 = 'b0;
    src1_data8  = 'b0;
    src1_data16 = 'b0;
    src1_data32 = 'b0;

    if (is_div_op) begin
      case(vs2_eew)
        EEW8: begin
          // bytes [0, VLENB/2) run on the 8-bit dividers unchanged
          for (int i=0;i<`VLENB/2;i++) begin
            src2_data8[i] = vs2_data[i*`BYTE_WIDTH +: `BYTE_WIDTH];
            src1_data8[i] = src1_vec[i*`BYTE_WIDTH +: `BYTE_WIDTH];
          end
          // bytes [VLENB/2, VLENB*3/4) are extended to 16 bits
          for (int i=`VLENB/2;i<`VLENB*3/4;i++) begin
            src2_data16[i-`VLENB/2] = {{EXT_BYTE_TO_HWORD{is_signed_op&vs2_data[(i+1)*`BYTE_WIDTH-1]}},
                                       vs2_data[i*`BYTE_WIDTH +: `BYTE_WIDTH]};
            src1_data16[i-`VLENB/2] = {{EXT_BYTE_TO_HWORD{is_signed_op&src1_vec[(i+1)*`BYTE_WIDTH-1]}},
                                       src1_vec[i*`BYTE_WIDTH +: `BYTE_WIDTH]};
          end
          // bytes [VLENB*3/4, VLENB) are extended to 32 bits
          for (int i=`VLENB*3/4;i<`VLENB;i++) begin
            src2_data32[i-`VLENB*3/4] = {{EXT_BYTE_TO_WORD{is_signed_op&vs2_data[(i+1)*`BYTE_WIDTH-1]}},
                                         vs2_data[i*`BYTE_WIDTH +: `BYTE_WIDTH]};
            src1_data32[i-`VLENB*3/4] = {{EXT_BYTE_TO_WORD{is_signed_op&src1_vec[(i+1)*`BYTE_WIDTH-1]}},
                                         src1_vec[i*`BYTE_WIDTH +: `BYTE_WIDTH]};
          end
        end
        EEW16: begin
          // low half of the halfwords run on the 16-bit dividers unchanged
          for (int i=0;i<`VLEN/`HWORD_WIDTH/2;i++) begin
            src2_data16[i] = vs2_data[i*`HWORD_WIDTH +: `HWORD_WIDTH];
            src1_data16[i] = src1_vec[i*`HWORD_WIDTH +: `HWORD_WIDTH];
          end
          // high half of the halfwords are extended to 32 bits
          for (int i=`VLEN/`HWORD_WIDTH/2;i<`VLEN/`HWORD_WIDTH;i++) begin
            src2_data32[i-`VLEN/`HWORD_WIDTH/2] = {{EXT_HWORD_TO_WORD{is_signed_op&vs2_data[(i+1)*`HWORD_WIDTH-1]}},
                                                   vs2_data[i*`HWORD_WIDTH +: `HWORD_WIDTH]};
            src1_data32[i-`VLEN/`HWORD_WIDTH/2] = {{EXT_HWORD_TO_WORD{is_signed_op&src1_vec[(i+1)*`HWORD_WIDTH-1]}},
                                                   src1_vec[i*`HWORD_WIDTH +: `HWORD_WIDTH]};
          end
        end
        EEW32: begin
          // every word runs on a 32-bit divider
          for (int i=0;i<`VLEN/`WORD_WIDTH;i++) begin
            src2_data32[i] = vs2_data[i*`WORD_WIDTH +: `WORD_WIDTH];
            src1_data32[i] = src1_vec[i*`WORD_WIDTH +: `WORD_WIDTH];
          end
        end
      endcase
    end
  end

  // get opcode for divider: DIV_ZERO for the unsigned uops, DIV_SIGN otherwise
  // (DIV_SIGN is also the idle default when no divide uop is decoded)
  always_comb begin
    opcode = DIV_SIGN;

    if (is_div_op && !is_signed_op)
      opcode = DIV_ZERO;
  end

//
// calculate the result
//
  // NOTE(review): the 32-bit bank starts on every valid uop and the 16-bit bank
  // on every vs2_eew other than EEW32, while result_all_valid only rises for
  // EEW8/EEW16/EEW32. This presumably relies on vs2_eew never taking any other
  // value for a divide uop - confirm against the dispatch logic.

  // 8-bit bank: only used when the element width is EEW8
  generate
    for(j=0;j<`VLENB/2;j++) begin: DIVIDER8
      rvv_backend_div_unit_divider
      #(
        .DIV_WIDTH          (`BYTE_WIDTH)
      )
      divider_8bit
      (
        .clk                (clk),
        .rst_n              (rst_n),
        .div_valid          (uop_valid&(vs2_eew==EEW8)),
        .opcode             (opcode),
        .src2_dividend      (src2_data8[j]),
        .src1_divisor       (src1_data8[j]),
        .result_quotient    (quotient8[j]),
        .result_remainder   (remainder8[j]),
        .result_valid       (result_valid8[j]),
        .result_ready       (result_ready&result_valid),
        .trap_flush_rvv     (trap_flush_rvv)
      );
    end
  endgenerate

  // 16-bit bank: started for every element width except EEW32
  generate
    for(j=0;j<`VLEN/`HWORD_WIDTH/2;j++) begin: DIVIDER16
      rvv_backend_div_unit_divider
      #(
        .DIV_WIDTH          (`HWORD_WIDTH)
      )
      divider_16bit
      (
        .clk                (clk),
        .rst_n              (rst_n),
        .div_valid          (uop_valid&(vs2_eew!=EEW32)),
        .opcode             (opcode),
        .src2_dividend      (src2_data16[j]),
        .src1_divisor       (src1_data16[j]),
        .result_quotient    (quotient16[j]),
        .result_remainder   (remainder16[j]),
        .result_valid       (result_valid16[j]),
        .result_ready       (result_ready&result_valid),
        .trap_flush_rvv     (trap_flush_rvv)
      );
    end
  endgenerate

  // 32-bit bank: started for every valid uop
  generate
    for(j=0;j<`VLEN/`WORD_WIDTH;j++) begin: DIVIDER32
      rvv_backend_div_unit_divider
      #(
        .DIV_WIDTH          (`WORD_WIDTH)
      )
      divider_32bit
      (
        .clk                (clk),
        .rst_n              (rst_n),
        .div_valid          (uop_valid),
        .opcode             (opcode),
        .src2_dividend      (src2_data32[j]),
        .src1_divisor       (src1_data32[j]),
        .result_quotient    (quotient32[j]),
        .result_remainder   (remainder32[j]),
        .result_valid       (result_valid32[j]),
        .result_ready       (result_ready&result_valid),
        .trap_flush_rvv     (trap_flush_rvv)
      );
    end
  endgenerate

  // check whether all the results are gotten: only the banks that take part in
  // the current element width are waited for
  always_comb begin
    result_all_valid = 'b0;

    case(vs2_eew)
      EEW8: begin
        result_all_valid = &{result_valid8,result_valid16,result_valid32};
      end
      EEW16: begin
        result_all_valid = &{result_valid16,result_valid32};
      end
      EEW32: begin
        result_all_valid = &result_valid32;
      end
    endcase
  end

  // pick quotient (VDIVU/VDIV) or remainder (VREMU/VREM) from every bank
  always_comb begin
    if (is_rem_op) begin
      lane_res8  = remainder8;
      lane_res16 = remainder16;
      lane_res32 = remainder32;
    end
    else begin
      lane_res8  = quotient8;
      lane_res16 = quotient16;
      lane_res32 = quotient32;
    end
  end

  // assign to result_data: inverse of the operand distribution above; results
  // from a wider divider are truncated to the element width
  always_comb begin
    // initial the data
    result_data = 'b0;

    if (is_div_op) begin
      case(vs2_eew)
        EEW8: begin
          for (int i=0;i<`VLENB/2;i++) begin
            result_data[i*`BYTE_WIDTH +: `BYTE_WIDTH] = lane_res8[i];
          end
          for (int i=`VLENB/2;i<`VLENB*3/4;i++) begin
            result_data[i*`BYTE_WIDTH +: `BYTE_WIDTH] = lane_res16[i-`VLENB/2][0 +: `BYTE_WIDTH];
          end
          for (int i=`VLENB*3/4;i<`VLENB;i++) begin
            result_data[i*`BYTE_WIDTH +: `BYTE_WIDTH] = lane_res32[i-`VLENB*3/4][0 +: `BYTE_WIDTH];
          end
        end
        EEW16: begin
          for (int i=0;i<`VLEN/`HWORD_WIDTH/2;i++) begin
            result_data[i*`HWORD_WIDTH +: `HWORD_WIDTH] = lane_res16[i];
          end
          for (int i=`VLEN/`HWORD_WIDTH/2;i<`VLEN/`HWORD_WIDTH;i++) begin
            result_data[i*`HWORD_WIDTH +: `HWORD_WIDTH] = lane_res32[i-`VLEN/`HWORD_WIDTH/2][0 +: `HWORD_WIDTH];
          end
        end
        EEW32: begin
          for (int i=0;i<`VLEN/`WORD_WIDTH;i++) begin
            result_data[i*`WORD_WIDTH +: `WORD_WIDTH] = lane_res32[i];
          end
        end
      endcase
    end
  end

//
// submit result to ROB
//
`ifdef TB_SUPPORT
  assign  result.uop_pc     = div_uop.uop_pc;
`endif
  assign  result.rob_entry  = rob_entry;
  assign  result.w_data     = result_data;
  assign  result_valid      = result_all_valid;
  assign  result.w_valid    = result_all_valid;
  // divide/remainder never saturates
  assign  result.vsaturate  = 'b0;

endmodule