1. fix ari decoder for immediate data; 2. fix decoder ctrl; 3. fix addsub and shifter saturate logic; 4. optimize viota instruction for better timing; 5. double reservation station depth.
Change-Id: Ic381343f4316c0f79db4533219c52836718a3b93
diff --git a/hdl/verilog/rvv/design/rvv_backend_alu_unit_addsub.sv b/hdl/verilog/rvv/design/rvv_backend_alu_unit_addsub.sv
index 61ad83c..334ec1a 100644
--- a/hdl/verilog/rvv/design/rvv_backend_alu_unit_addsub.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_alu_unit_addsub.sv
@@ -1000,58 +1000,133 @@
round8 = 'b0;
round16 = 'b0;
round32 = 'b0;
+
+ case(uop_funct6.ari_funct6)
+ VAADDU,
+ VASUBU: begin
+ case(vxrm)
+ RNU: begin
+ for(int i=0;i<`VLENB;i=i+1) begin
+ round8[i] = f_half_add8({cout8[i],product8[i][`BYTE_WIDTH-1:1]}, product8[i][0]);
+ end
- case(vxrm)
- RNU: begin
- for(int i=0;i<`VLENB;i=i+1) begin
- round8[i] = f_half_add8({cout8[i],product8[i][`BYTE_WIDTH-1:1]}, product8[i][0]);
- end
-
- for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
- round16[i] = f_half_add16({cout16[i],product16[i][`HWORD_WIDTH-1:1]}, product16[i][0]);
- end
- for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
- round32[i] = f_half_add32({cout32[i],product32[i][`WORD_WIDTH-1:1]}, product32[i][0]);
- end
+ for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
+ round16[i] = f_half_add16({cout16[i],product16[i][`HWORD_WIDTH-1:1]}, product16[i][0]);
+ end
+
+ for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
+ round32[i] = f_half_add32({cout32[i],product32[i][`WORD_WIDTH-1:1]}, product32[i][0]);
+ end
+ end
+ RNE: begin
+ for(int i=0;i<`VLENB;i=i+1) begin
+ round8[i] = f_half_add8({cout8[i],product8[i][`BYTE_WIDTH-1:1]}, (product8[i][0]&product8[i][1]));
+ end
+
+ for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
+ round16[i] = f_half_add16({cout16[i],product16[i][`HWORD_WIDTH-1:1]}, (product16[i][0]&product16[i][1]));
+ end
+
+ for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
+ round32[i] = f_half_add32({cout32[i],product32[i][`WORD_WIDTH-1:1]}, (product32[i][0]&product32[i][1]));
+ end
+ end
+ RDN: begin
+ for(int i=0;i<`VLENB;i=i+1) begin
+ round8[i] = {cout8[i],product8[i][`BYTE_WIDTH-1:1]};
+ end
+
+ for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
+ round16[i] = {cout16[i],product16[i][`HWORD_WIDTH-1:1]};
+ end
+
+ for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
+ round32[i] = {cout32[i],product32[i][`WORD_WIDTH-1:1]};
+ end
+ end
+ ROD: begin
+ for(int i=0;i<`VLENB;i=i+1) begin
+ round8[i] = f_half_add8({cout8[i],product8[i][`BYTE_WIDTH-1:1]}, ((!product8[i][1])&product8[i][0]));
+ end
+
+ for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
+ round16[i] = f_half_add16({cout16[i],product16[i][`HWORD_WIDTH-1:1]}, ((!product16[i][1])&product16[i][0]));
+ end
+
+ for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
+ round32[i] = f_half_add32({cout32[i],product32[i][`WORD_WIDTH-1:1]}, ((!product32[i][1])&product32[i][0]));
+ end
+ end
+ endcase
end
- RNE: begin
- for(int i=0;i<`VLENB;i=i+1) begin
- round8[i] = f_half_add8({cout8[i],product8[i][`BYTE_WIDTH-1:1]}, (product8[i][0]&product8[i][1]));
- end
+ VAADD,
+ VASUB: begin
+ case(vxrm)
+ RNU: begin
+ for(int i=0;i<`VLENB;i=i+1) begin
+ round8[i] = f_half_add8({src2_data[i][`BYTE_WIDTH-1]^src1_data[i][`BYTE_WIDTH-1]?(!cout8[i]):cout8[i],
+ product8[i][`BYTE_WIDTH-1:1]}, product8[i][0]);
+ end
+
+ for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
+ round16[i] = f_half_add16({src2_data[2*i+1][`BYTE_WIDTH-1]^src1_data[2*i+1][`BYTE_WIDTH-1]?(!cout16[i]):cout16[i],
+ product16[i][`HWORD_WIDTH-1:1]}, product16[i][0]);
+ end
- for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
- round16[i] = f_half_add16({cout16[i],product16[i][`HWORD_WIDTH-1:1]}, (product16[i][0]&product16[i][1]));
- end
-
- for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
- round32[i] = f_half_add32({cout32[i],product32[i][`WORD_WIDTH-1:1]}, (product32[i][0]&product32[i][1]));
- end
- end
- RDN: begin
- for(int i=0;i<`VLENB;i=i+1) begin
- round8[i] = {cout8[i],product8[i][`BYTE_WIDTH-1:1]};
- end
-
- for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
- round16[i] = {cout16[i],product16[i][`HWORD_WIDTH-1:1]};
- end
-
- for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
- round32[i] = {cout32[i],product32[i][`WORD_WIDTH-1:1]};
- end
- end
- ROD: begin
- for(int i=0;i<`VLENB;i=i+1) begin
- round8[i] = f_half_add8({cout8[i],product8[i][`BYTE_WIDTH-1:1]}, ((!product8[i][1])&product8[i][0]));
- end
-
- for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
- round16[i] = f_half_add16({cout16[i],product16[i][`HWORD_WIDTH-1:1]}, ((!product16[i][1])&product16[i][0]));
- end
-
- for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
- round32[i] = f_half_add32({cout32[i],product32[i][`WORD_WIDTH-1:1]}, ((!product32[i][1])&product32[i][0]));
- end
+ for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
+ round32[i] = f_half_add32({src2_data[4*i+3][`BYTE_WIDTH-1]^src1_data[4*i+3][`BYTE_WIDTH-1]?(!cout32[i]):cout32[i],
+ product32[i][`WORD_WIDTH-1:1]}, product32[i][0]);
+ end
+ end
+ RNE: begin
+ for(int i=0;i<`VLENB;i=i+1) begin
+ round8[i] = f_half_add8({src2_data[i][`BYTE_WIDTH-1]^src1_data[i][`BYTE_WIDTH-1]?(!cout8[i]):cout8[i],
+ product8[i][`BYTE_WIDTH-1:1]}, (product8[i][0]&product8[i][1]));
+ end
+
+ for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
+ round16[i] = f_half_add16({src2_data[2*i+1][`BYTE_WIDTH-1]^src1_data[2*i+1][`BYTE_WIDTH-1]?(!cout16[i]):cout16[i],
+ product16[i][`HWORD_WIDTH-1:1]}, (product16[i][0]&product16[i][1]));
+ end
+
+ for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
+ round32[i] = f_half_add32({src2_data[4*i+3][`BYTE_WIDTH-1]^src1_data[4*i+3][`BYTE_WIDTH-1]?(!cout32[i]):cout32[i],
+ product32[i][`WORD_WIDTH-1:1]}, (product32[i][0]&product32[i][1]));
+ end
+ end
+ RDN: begin
+ for(int i=0;i<`VLENB;i=i+1) begin
+ round8[i] = {src2_data[i][`BYTE_WIDTH-1]^src1_data[i][`BYTE_WIDTH-1]?(!cout8[i]):cout8[i],
+ product8[i][`BYTE_WIDTH-1:1]};
+ end
+
+ for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
+ round16[i] = {src2_data[2*i+1][`BYTE_WIDTH-1]^src1_data[2*i+1][`BYTE_WIDTH-1]?(!cout16[i]):cout16[i],
+ product16[i][`HWORD_WIDTH-1:1]};
+ end
+
+ for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
+ round32[i] = {src2_data[4*i+3][`BYTE_WIDTH-1]^src1_data[4*i+3][`BYTE_WIDTH-1]?(!cout32[i]):cout32[i],
+ product32[i][`WORD_WIDTH-1:1]};
+ end
+ end
+ ROD: begin
+ for(int i=0;i<`VLENB;i=i+1) begin
+ round8[i] = f_half_add8({src2_data[i][`BYTE_WIDTH-1]^src1_data[i][`BYTE_WIDTH-1]?(!cout8[i]):cout8[i],
+ product8[i][`BYTE_WIDTH-1:1]}, ((!product8[i][1])&product8[i][0]));
+ end
+
+ for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
+ round16[i] = f_half_add16({src2_data[2*i+1][`BYTE_WIDTH-1]^src1_data[2*i+1][`BYTE_WIDTH-1]?(!cout16[i]):cout16[i],
+ product16[i][`HWORD_WIDTH-1:1]}, ((!product16[i][1])&product16[i][0]));
+ end
+
+ for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
+ round32[i] = f_half_add32({src2_data[4*i+3][`BYTE_WIDTH-1]^src1_data[4*i+3][`BYTE_WIDTH-1]?(!cout32[i]):cout32[i],
+ product32[i][`WORD_WIDTH-1:1]}, ((!product32[i][1])&product32[i][0]));
+ end
+ end
+ endcase
end
endcase
end
@@ -1073,79 +1148,79 @@
addu_upoverflow[4*j +: 4] = {cout8[4*j+3],cout8[4*j+2],cout8[4*j+1],cout8[4*j]};
add_upoverflow[4*j +: 4] = {
- ((cout8[4*j+3]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
- ((cout8[4*j+2]==1'b1)&(src2_data[4*j+2][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+2][`BYTE_WIDTH-1]==1'b0)),
- ((cout8[4*j+1]==1'b1)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b0)),
- ((cout8[4*j] ==1'b1)&(src2_data[4*j ][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j ][`BYTE_WIDTH-1]==1'b0))};
+ ((product8[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
+ ((product8[4*j+2][`BYTE_WIDTH-1]==1'b1)&(src2_data[4*j+2][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+2][`BYTE_WIDTH-1]==1'b0)),
+ ((product8[4*j+1][`BYTE_WIDTH-1]==1'b1)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b0)),
+ ((product8[4*j ][`BYTE_WIDTH-1]==1'b1)&(src2_data[4*j ][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j ][`BYTE_WIDTH-1]==1'b0))};
add_underoverflow[4*j +: 4] = {
- ((cout8[4*j+3]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
- ((cout8[4*j+2]==1'b0)&(src2_data[4*j+2][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+2][`BYTE_WIDTH-1]==1'b1)),
- ((cout8[4*j+1]==1'b0)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b1)),
- ((cout8[4*j] ==1'b0)&(src2_data[4*j ][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j ][`BYTE_WIDTH-1]==1'b1))};
+ ((product8[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
+ ((product8[4*j+2][`BYTE_WIDTH-1]==1'b0)&(src2_data[4*j+2][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+2][`BYTE_WIDTH-1]==1'b1)),
+ ((product8[4*j+1][`BYTE_WIDTH-1]==1'b0)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b1)),
+ ((product8[4*j ][`BYTE_WIDTH-1]==1'b0)&(src2_data[4*j ][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j ][`BYTE_WIDTH-1]==1'b1))};
subu_underoverflow[4*j +: 4] = {cout8[4*j+3],cout8[4*j+2],cout8[4*j+1],cout8[4*j]};
sub_upoverflow[4*j +: 4] = {
- ((cout8[4*j+3]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
- ((cout8[4*j+2]==1'b1)&(src2_data[4*j+2][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+2][`BYTE_WIDTH-1]==1'b1)),
- ((cout8[4*j+1]==1'b1)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b1)),
- ((cout8[4*j] ==1'b1)&(src2_data[4*j ][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j ][`BYTE_WIDTH-1]==1'b1))};
+ ((product8[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
+ ((product8[4*j+2][`BYTE_WIDTH-1]==1'b1)&(src2_data[4*j+2][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+2][`BYTE_WIDTH-1]==1'b1)),
+ ((product8[4*j+1][`BYTE_WIDTH-1]==1'b1)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b1)),
+ ((product8[4*j ][`BYTE_WIDTH-1]==1'b1)&(src2_data[4*j ][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j ][`BYTE_WIDTH-1]==1'b1))};
sub_underoverflow[4*j +: 4] = {
- ((cout8[4*j+3]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
- ((cout8[4*j+2]==1'b0)&(src2_data[4*j+2][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+2][`BYTE_WIDTH-1]==1'b0)),
- ((cout8[4*j+1]==1'b0)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b0)),
- ((cout8[4*j] ==1'b0)&(src2_data[4*j ][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j ][`BYTE_WIDTH-1]==1'b0))};
+ ((product8[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
+ ((product8[4*j+2][`BYTE_WIDTH-1]==1'b0)&(src2_data[4*j+2][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+2][`BYTE_WIDTH-1]==1'b0)),
+ ((product8[4*j+1][`BYTE_WIDTH-1]==1'b0)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b0)),
+ ((product8[4*j ][`BYTE_WIDTH-1]==1'b0)&(src2_data[4*j ][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j ][`BYTE_WIDTH-1]==1'b0))};
end
EEW16: begin
addu_upoverflow[4*j +: 4] = {cout16[2*j+1],1'b0,cout16[2*j],1'b0};
add_upoverflow[4*j +: 4] = {
- ((cout16[2*j+1]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
+ ((product16[2*j+1][`HWORD_WIDTH-1]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
1'b0,
- ((cout16[2*j] ==1'b1)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b0)),
+ ((product16[2*j ][`HWORD_WIDTH-1]==1'b1)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b0)),
1'b0};
add_underoverflow[4*j +: 4] = {
- ((cout16[2*j+1]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
+ ((product16[2*j+1][`HWORD_WIDTH-1]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
1'b0,
- ((cout16[2*j] ==1'b0)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b1)),
+ ((product16[2*j ][`HWORD_WIDTH-1]==1'b0)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b1)),
1'b0};
subu_underoverflow[4*j +: 4] = {cout16[2*j+1],1'b0,cout16[2*j],1'b0};
sub_upoverflow[4*j +: 4] = {
- ((cout16[2*j+1]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
+ ((product16[2*j+1][`HWORD_WIDTH-1]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
1'b0,
- ((cout16[2*j] ==1'b1)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b1)),
+ ((product16[2*j ][`HWORD_WIDTH-1]==1'b1)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b1)),
1'b0};
sub_underoverflow[4*j +: 4] = {
- ((cout16[2*j+1]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
+ ((product16[2*j+1][`HWORD_WIDTH-1]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
1'b0,
- ((cout16[2*j] ==1'b0)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b0)),
+ ((product16[2*j ][`HWORD_WIDTH-1]==1'b0)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b0)),
1'b0};
end
EEW32: begin
addu_upoverflow[4*j +: 4] = {cout32[j],3'b0};
add_upoverflow[4*j +: 4] = {
- ((cout32[j]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
+ ((product32[j][`WORD_WIDTH-1]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
3'b0};
add_underoverflow[4*j +: 4] = {
- ((cout32[j]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
+ ((product32[j][`WORD_WIDTH-1]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
3'b0};
subu_underoverflow[4*j +: 4] = {cout32[j],3'b0};
sub_upoverflow[4*j +: 4] = {
- ((cout32[j]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
+ ((product32[j][`WORD_WIDTH-1]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
3'b0};
sub_underoverflow[4*j +: 4] = {
- ((cout32[j]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
+ ((product32[j][`WORD_WIDTH-1]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
3'b0};
end
endcase
diff --git a/hdl/verilog/rvv/design/rvv_backend_alu_unit_mask.sv b/hdl/verilog/rvv/design/rvv_backend_alu_unit_mask.sv
index dab4550..d7f1a43 100644
--- a/hdl/verilog/rvv/design/rvv_backend_alu_unit_mask.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_alu_unit_mask.sv
@@ -65,18 +65,16 @@
logic [`VLEN-1:0] result_data_vfirst;
logic [`VLEN/16-1:0][4:0] result_data_vcpop;
logic [`VLEN-1:0][$clog2(`VLEN)-1:0] result_data_viota;
- logic [`VLEN/4-1:0][$clog2(`VLEN/4)-1:0] result_data_viota_pct25;
- logic [`VLEN/4-1:0][$clog2(`VLEN/4)-1:0] result_data_viota_pct50;
- logic [`VLEN/2-1:0][$clog2(`VLEN/2)-1:0] result_data_viota_pct100;
+ logic [`VLEN-1:0][$clog2(`BYTE_WIDTH):0] result_data_viota_per8;
logic [`VLENB-1:0][$clog2(`VLEN)-1:0] result_data_viota8;
logic [`VLEN/`HWORD_WIDTH-1:0][$clog2(`VLEN)-1:0] result_data_viota16;
logic [`VLEN/`WORD_WIDTH-1:0][$clog2(`VLEN)-1:0] result_data_viota32;
logic [`VLEN-1:0] result_data_vid8;
logic [`VLEN-1:0] result_data_vid16;
logic [`VLEN-1:0] result_data_vid32;
-
+
// for-loop
- genvar j;
+ genvar j;
//
// prepare source data to calculate
@@ -226,7 +224,7 @@
// initial the data
src2_data = 'b0;
src1_data = 'b0;
- src2_data_viota = 'b0;
+ src2_data_viota = 'b0;
// prepare source data
case(uop_funct3)
@@ -299,9 +297,9 @@
end
VIOTA: begin
if (vm==1'b1)
- src2_data_viota = vs2_data;
+ src2_data_viota = {vs2_data,1'b0};
else
- src2_data_viota = vs2_data&v0_data;
+ src2_data_viota = {vs2_data&v0_data,1'b0};
end
// no source operand for VID
endcase
@@ -352,32 +350,39 @@
endgenerate
// viota
- assign result_data_viota_pct25[0] = 'b0;
- assign result_data_viota_pct50[0] = 'b0;
- assign result_data_viota_pct100[0] = 'b0;
-
generate
- for(j=1; j<`VLEN/4;j++) begin: VIOTA_PCT25
- assign result_data_viota_pct25[j] = src2_data_viota[j-1]+result_data_viota_pct25[j-1];
+ for(j=0; j<`VLENB;j++) begin: GET_VIOTA_PER8 // one f_viota8 call per 8-bit group of src2_data_viota
+ assign {result_data_viota_per8[8*j+7],
+ result_data_viota_per8[8*j+6],
+ result_data_viota_per8[8*j+5],
+ result_data_viota_per8[8*j+4],
+ result_data_viota_per8[8*j+3],
+ result_data_viota_per8[8*j+2],
+ result_data_viota_per8[8*j+1],
+ result_data_viota_per8[8*j]} = f_viota8(src2_data_viota[8*j +: 8]); // entry 8*j+k = number of 1 bits in src2_data_viota[8*j+k : 8*j] (this group only)
end
- for(j=1; j<`VLEN/4;j++) begin: VIOTA_PCT50
- assign result_data_viota_pct50[j] = src2_data_viota[`VLEN/4+j-1]+result_data_viota_pct50[j-1];
- end
-
- for(j=1; j<`VLEN/2;j++) begin: VIOTA_PCT100
- assign result_data_viota_pct100[j] = src2_data_viota[`VLEN/2+j-1]+result_data_viota_pct100[j-1];
- end
- endgenerate
-
- generate
- for(j=0; j<`VLEN;j++) begin: GET_VIOTA
- if (j<`VLEN/4)
- assign result_data_viota[j] = result_data_viota_pct25[j];
- else if (j<`VLEN/2)
- assign result_data_viota[j] = result_data_viota_pct50[j-`VLEN/4]+result_data_viota_pct25[`VLEN/4-1];
- else
- assign result_data_viota[j] = result_data_viota_pct100[j-`VLEN/2]+result_data_viota_pct50[`VLEN/4-1];
+    for(j=0; j<`VLENB;j++) begin: GET_VIOTA // turn the per-8-bit-group counts into counts over the whole mask
+      if (j==0) begin // first group: nothing precedes it, so the per-group counts are already the final values
+        assign result_data_viota[0] = result_data_viota_per8[0];
+        assign result_data_viota[1] = result_data_viota_per8[1];
+        assign result_data_viota[2] = result_data_viota_per8[2];
+        assign result_data_viota[3] = result_data_viota_per8[3];
+        assign result_data_viota[4] = result_data_viota_per8[4];
+        assign result_data_viota[5] = result_data_viota_per8[5];
+        assign result_data_viota[6] = result_data_viota_per8[6];
+        assign result_data_viota[7] = result_data_viota_per8[7];
+      end
+      else begin // later groups: base is the ACCUMULATED count result_data_viota[8*j-1]; result_data_viota_per8[8*j-1] only covers group j-1 and would drop every earlier group
+        assign result_data_viota[8*j  ] = result_data_viota_per8[8*j  ] + result_data_viota[8*j-1];
+        assign result_data_viota[8*j+1] = result_data_viota_per8[8*j+1] + result_data_viota[8*j-1];
+        assign result_data_viota[8*j+2] = result_data_viota_per8[8*j+2] + result_data_viota[8*j-1];
+        assign result_data_viota[8*j+3] = result_data_viota_per8[8*j+3] + result_data_viota[8*j-1];
+        assign result_data_viota[8*j+4] = result_data_viota_per8[8*j+4] + result_data_viota[8*j-1];
+        assign result_data_viota[8*j+5] = result_data_viota_per8[8*j+5] + result_data_viota[8*j-1];
+        assign result_data_viota[8*j+6] = result_data_viota_per8[8*j+6] + result_data_viota[8*j-1];
+        assign result_data_viota[8*j+7] = result_data_viota_per8[8*j+7] + result_data_viota[8*j-1];
+      end
end
endgenerate
@@ -529,9 +534,9 @@
// submit result to ROB
//
`ifdef TB_SUPPORT
- assign result.uop_pc = alu_uop.uop_pc;
+ assign result.uop_pc = alu_uop.uop_pc;
`endif
- assign result.rob_entry = rob_entry;
+ assign result.rob_entry = rob_entry;
// result data
generate
@@ -602,7 +607,7 @@
// result valid signal
assign result.w_valid = result_valid;
-
+
// saturate signal
assign result.vsaturate = 'b0;
@@ -694,4 +699,62 @@
f_vmsbf = {1'b0, (src2[`VLEN-1:1]-1) & src2[`VLEN-1:1]};
endfunction
+ // viota helper: f_viota4[k] = number of 1 bits in src[k:0] for k = 0..3; each entry is 3 bits wide
+ function [3:0][2:0] f_viota4;
+ input logic [3:0] src;
+ // entry 0: count over src[0] only (0 or 1)
+ if (src[0]==1'b1)
+ f_viota4[0] = 'd1;
+ else
+ f_viota4[0] = 'b0;
+ // entry 1: count over src[1:0] (0..2), written as an explicit pattern match
+ if (src[1:0]==2'b11)
+ f_viota4[1] = 'd2;
+ else if ((src[1:0]==2'b10)|(src[1:0]==2'b01))
+ f_viota4[1] = 'd1;
+ else
+ f_viota4[1] = 'b0;
+ // entry 2: count over src[2:0] (0..3)
+ if (src[2:0]==3'b111)
+ f_viota4[2] = 'd3;
+ else if ((src[2:0]==3'b011)|(src[2:0]==3'b101)|(src[2:0]==3'b110))
+ f_viota4[2] = 'd2;
+ else if ((src[2:0]==3'b001)|(src[2:0]==3'b010)|(src[2:0]==3'b100))
+ f_viota4[2] = 'd1;
+ else
+ f_viota4[2] = 'b0;
+ // entry 3: count over src[3:0] (0..4); any pattern not matched below (all zero, or x/z in simulation) yields 0
+ if (src[3:0]==4'b1111)
+ f_viota4[3] = 'd4;
+ else if ((src[3:0]==4'b0111)|(src[3:0]==4'b1011)|(src[3:0]==4'b1101)|(src[3:0]==4'b1110))
+ f_viota4[3] = 'd3;
+ else if ((src[3:0]==4'b0011)|(src[3:0]==4'b0101)|(src[3:0]==4'b1001)|(src[3:0]==4'b0110)|(src[3:0]==4'b1010)|(src[3:0]==4'b1100))
+ f_viota4[3] = 'd2;
+ else if ((src[3:0]==4'b0001)|(src[3:0]==4'b0010)|(src[3:0]==4'b0100)|(src[3:0]==4'b1000))
+ f_viota4[3] = 'd1;
+ else
+ f_viota4[3] = 'b0;
+
+ endfunction
+
+ function [7:0][3:0] f_viota8;
+ input logic [7:0] src;
+ // f_viota8[k] = number of 1 bits in src[k:0] for k = 0..7 (4-bit entries), built from two f_viota4 calls
+ logic [3:0][2:0] viota4_lo;
+ logic [3:0][2:0] viota4_hi;
+ // running counts inside each 4-bit half of src
+ viota4_lo = f_viota4(src[3:0]);
+ viota4_hi = f_viota4(src[7:4]);
+ // low half is used as is; high half adds viota4_lo[3], the total number of 1 bits in src[3:0]
+ f_viota8[0] = viota4_lo[0];
+ f_viota8[1] = viota4_lo[1];
+ f_viota8[2] = viota4_lo[2];
+ f_viota8[3] = viota4_lo[3];
+ f_viota8[4] = viota4_hi[0]+viota4_lo[3];
+ f_viota8[5] = viota4_hi[1]+viota4_lo[3];
+ f_viota8[6] = viota4_hi[2]+viota4_lo[3];
+ f_viota8[7] = viota4_hi[3]+viota4_lo[3];
+
+ endfunction
+
endmodule
diff --git a/hdl/verilog/rvv/design/rvv_backend_alu_unit_shift.sv b/hdl/verilog/rvv/design/rvv_backend_alu_unit_shift.sv
index 51eb375..705b6f6 100755
--- a/hdl/verilog/rvv/design/rvv_backend_alu_unit_shift.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_alu_unit_shift.sv
@@ -668,34 +668,136 @@
case(vs2_eew)
EEW16: begin
- // unsigned overflow check for vnclipu
- if (opcode == SHIFT_SRL) begin
- upoverflow[4*j +: 4] = {
- ({cout16[2*j+1], round16[2*j+1][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0),1'b0,
- ({cout16[2*j], round16[2*j][ `BYTE_WIDTH +: `BYTE_WIDTH]}!='b0),1'b0};
- end
- else if (opcode == SHIFT_SRA) begin
- // signed overflow check for vnclip
- upoverflow[4*j +: 4] = {
- ({cout16[2*j+1], round16[2*j+1][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)&(round16[2*j+1][`BYTE_WIDTH-1]==1'b0),1'b0,
- ({cout16[2*j], round16[2*j][ `BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)&(round16[2*j][ `BYTE_WIDTH-1]==1'b0),1'b0};
-
- underoverflow[4*j +: 4] = {
- ((&{cout16[2*j+1], round16[2*j+1][`BYTE_WIDTH +: `BYTE_WIDTH]})!=1'b1)&(round16[2*j+1][`BYTE_WIDTH-1]==1'b1),1'b0,
- ((&{cout16[2*j], round16[2*j][ `BYTE_WIDTH +: `BYTE_WIDTH]})!=1'b1)&(round16[2*j][ `BYTE_WIDTH-1]==1'b1),1'b0};
- end
+ case(opcode)
+ SHIFT_SRL: begin
+ // unsigned overflow check for vnclipu
+ if(uop_index[0]==1'b0) begin
+ if(j<`VLEN/`WORD_WIDTH/2) begin
+ upoverflow[4*j +: 4] = {
+ ({cout16[4*j+3], round16[4*j+3][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0),
+ ({cout16[4*j+2], round16[4*j+2][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0),
+ ({cout16[4*j+1], round16[4*j+1][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0),
+ ({cout16[4*j ], round16[4*j ][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)};
+ end
+ else begin
+ upoverflow[4*j +: 4] = 'b0;
+ end
+ end
+ else begin
+ if(j<`VLEN/`WORD_WIDTH/2) begin
+ upoverflow[4*j +: 4] = 'b0;
+ end
+ else begin
+ upoverflow[4*j +: 4] = {
+ ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)+3], round16[4*(j-`VLEN/`WORD_WIDTH/2)+3][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0),
+ ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)+2], round16[4*(j-`VLEN/`WORD_WIDTH/2)+2][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0),
+ ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)+1], round16[4*(j-`VLEN/`WORD_WIDTH/2)+1][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0),
+ ({cout16[4*(j-`VLEN/`WORD_WIDTH/2) ], round16[4*(j-`VLEN/`WORD_WIDTH/2) ][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)};
+ end
+ end
+ end
+ SHIFT_SRA: begin
+ // signed overflow check for vnclip
+ if(uop_index[0]==1'b0) begin
+ if(j<`VLEN/`WORD_WIDTH/2) begin
+ upoverflow[4*j +: 4] = {
+ ({cout16[4*j+3], round16[4*j+3][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)&(round16[4*j+3][`BYTE_WIDTH-1]==1'b0),
+ ({cout16[4*j+2], round16[4*j+2][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)&(round16[4*j+2][`BYTE_WIDTH-1]==1'b0),
+ ({cout16[4*j+1], round16[4*j+1][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)&(round16[4*j+1][`BYTE_WIDTH-1]==1'b0),
+ ({cout16[4*j ], round16[4*j ][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)&(round16[4*j ][`BYTE_WIDTH-1]==1'b0)};
+
+ underoverflow[4*j +: 4] = {
+ ({cout16[4*j+3], round16[4*j+3][`BYTE_WIDTH +: `BYTE_WIDTH]}!='1)&(round16[4*j+3][`BYTE_WIDTH-1]==1'b1),
+ ({cout16[4*j+2], round16[4*j+2][`BYTE_WIDTH +: `BYTE_WIDTH]}!='1)&(round16[4*j+2][`BYTE_WIDTH-1]==1'b1),
+ ({cout16[4*j+1], round16[4*j+1][`BYTE_WIDTH +: `BYTE_WIDTH]}!='1)&(round16[4*j+1][`BYTE_WIDTH-1]==1'b1),
+ ({cout16[4*j ], round16[4*j ][`BYTE_WIDTH +: `BYTE_WIDTH]}!='1)&(round16[4*j ][`BYTE_WIDTH-1]==1'b1)};
+ end
+ else begin
+ upoverflow[4*j +: 4] = 'b0;
+ underoverflow[4*j +: 4] = 'b0;
+ end
+ end
+ else begin
+ if(j<`VLEN/`WORD_WIDTH/2) begin
+ upoverflow[4*j +: 4] = 'b0;
+ underoverflow[4*j +: 4] = 'b0;
+ end
+ else begin
+ upoverflow[4*j +: 4] = {
+ ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)+3], round16[4*(j-`VLEN/`WORD_WIDTH/2)+3][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)&(round16[4*(j-`VLEN/`WORD_WIDTH/2)+3][`BYTE_WIDTH-1]==1'b0),
+ ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)+2], round16[4*(j-`VLEN/`WORD_WIDTH/2)+2][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)&(round16[4*(j-`VLEN/`WORD_WIDTH/2)+2][`BYTE_WIDTH-1]==1'b0),
+ ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)+1], round16[4*(j-`VLEN/`WORD_WIDTH/2)+1][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)&(round16[4*(j-`VLEN/`WORD_WIDTH/2)+1][`BYTE_WIDTH-1]==1'b0),
+ ({cout16[4*(j-`VLEN/`WORD_WIDTH/2) ], round16[4*(j-`VLEN/`WORD_WIDTH/2) ][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)&(round16[4*(j-`VLEN/`WORD_WIDTH/2) ][`BYTE_WIDTH-1]==1'b0)};
+
+ underoverflow[4*j +: 4] = {
+ ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)+3], round16[4*(j-`VLEN/`WORD_WIDTH/2)+3][`BYTE_WIDTH +: `BYTE_WIDTH]}!='1)&(round16[4*(j-`VLEN/`WORD_WIDTH/2)+3][`BYTE_WIDTH-1]==1'b1),
+ ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)+2], round16[4*(j-`VLEN/`WORD_WIDTH/2)+2][`BYTE_WIDTH +: `BYTE_WIDTH]}!='1)&(round16[4*(j-`VLEN/`WORD_WIDTH/2)+2][`BYTE_WIDTH-1]==1'b1),
+ ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)+1], round16[4*(j-`VLEN/`WORD_WIDTH/2)+1][`BYTE_WIDTH +: `BYTE_WIDTH]}!='1)&(round16[4*(j-`VLEN/`WORD_WIDTH/2)+1][`BYTE_WIDTH-1]==1'b1),
+ ({cout16[4*(j-`VLEN/`WORD_WIDTH/2) ], round16[4*(j-`VLEN/`WORD_WIDTH/2) ][`BYTE_WIDTH +: `BYTE_WIDTH]}!='1)&(round16[4*(j-`VLEN/`WORD_WIDTH/2) ][`BYTE_WIDTH-1]==1'b1)};
+ end
+ end
+ end
+ endcase
end
EEW32: begin
// unsigned overflow check for vnclipu
- if (opcode == SHIFT_SRL) begin
- upoverflow[4*j +: 4] = {({cout32[j], round32[j][`HWORD_WIDTH +: `HWORD_WIDTH]}!='b0), 3'b0};
- end
- else if (opcode == SHIFT_SRA) begin
- // signed overflow check for vnclip
- upoverflow[4*j +: 4] = {({cout32[j], round32[j][`HWORD_WIDTH +: `HWORD_WIDTH]}!='b0)&(round32[j][`HWORD_WIDTH-1]==1'b0), 3'b0};
-
- underoverflow[4*j +: 4] = {((&{cout32[j], round32[j][`HWORD_WIDTH +: `HWORD_WIDTH]})!=1'b1)&(round32[j][`HWORD_WIDTH-1]==1'b1), 3'b0};
- end
+ case(opcode)
+ SHIFT_SRL: begin
+ // unsigned overflow check for vnclipu
+ if(uop_index[0]==1'b0) begin
+ if(j<`VLEN/`WORD_WIDTH/2) begin
+ upoverflow[4*j +: 4] = {
+ ({cout32[2*j+1], round32[2*j+1][`HWORD_WIDTH +: `HWORD_WIDTH]}!='b0),1'b0,
+ ({cout32[2*j ], round32[2*j ][`HWORD_WIDTH +: `HWORD_WIDTH]}!='b0),1'b0};
+ end
+ else begin
+ upoverflow[4*j +: 4] = 'b0;
+ end
+ end
+ else begin
+ if(j<`VLEN/`WORD_WIDTH/2) begin
+ upoverflow[4*j +: 4] = 'b0;
+ end
+ else begin
+ upoverflow[4*j +: 4] = {
+ ({cout32[2*(j-`VLEN/`WORD_WIDTH/2)+1], round32[2*(j-`VLEN/`WORD_WIDTH/2)+1][`HWORD_WIDTH +: `HWORD_WIDTH]}!='b0),1'b0,
+ ({cout32[2*(j-`VLEN/`WORD_WIDTH/2) ], round32[2*(j-`VLEN/`WORD_WIDTH/2) ][`HWORD_WIDTH +: `HWORD_WIDTH]}!='b0),1'b0};
+ end
+ end
+ end
+          SHIFT_SRA: begin // signed overflow check for vnclip, vs2_eew==EEW32: all slices of round32 use HWORD_WIDTH
+            if(uop_index[0]==1'b0) begin // even uop_index: flags go to groups j below `VLEN/`WORD_WIDTH/2, the rest are cleared
+              if(j<`VLEN/`WORD_WIDTH/2) begin
+                upoverflow[4*j +: 4] = {
+                  ({cout32[2*j+1], round32[2*j+1][`HWORD_WIDTH +: `HWORD_WIDTH]}!='b0)&(round32[2*j+1][`HWORD_WIDTH-1]==1'b0),1'b0,
+                  ({cout32[2*j  ], round32[2*j  ][`HWORD_WIDTH +: `HWORD_WIDTH]}!='b0)&(round32[2*j  ][`HWORD_WIDTH-1]==1'b0),1'b0};
+                // underoverflow: {cout32, upper half of round32} is not all ones while bit `HWORD_WIDTH-1 is 1
+                underoverflow[4*j +: 4] = {
+                  ({cout32[2*j+1], round32[2*j+1][`HWORD_WIDTH +: `HWORD_WIDTH]}!='1)&(round32[2*j+1][`HWORD_WIDTH-1]==1'b1),1'b0,
+                  ({cout32[2*j  ], round32[2*j  ][`HWORD_WIDTH +: `HWORD_WIDTH]}!='1)&(round32[2*j  ][`HWORD_WIDTH-1]==1'b1),1'b0};
+              end
+              else begin
+                upoverflow[4*j +: 4] = 'b0;
+                underoverflow[4*j +: 4] = 'b0;
+              end
+            end
+            else begin // odd uop_index: flags go to groups j at or above `VLEN/`WORD_WIDTH/2, read from elements re-based by that offset
+              if(j<`VLEN/`WORD_WIDTH/2) begin
+                upoverflow[4*j +: 4] = 'b0;
+                underoverflow[4*j +: 4] = 'b0;
+              end
+              else begin
+                upoverflow[4*j +: 4] = {
+                  ({cout32[2*(j-`VLEN/`WORD_WIDTH/2)+1], round32[2*(j-`VLEN/`WORD_WIDTH/2)+1][`HWORD_WIDTH +: `HWORD_WIDTH]}!='b0)&(round32[2*(j-`VLEN/`WORD_WIDTH/2)+1][`HWORD_WIDTH-1]==1'b0),1'b0,
+                  ({cout32[2*(j-`VLEN/`WORD_WIDTH/2)  ], round32[2*(j-`VLEN/`WORD_WIDTH/2)  ][`HWORD_WIDTH +: `HWORD_WIDTH]}!='b0)&(round32[2*(j-`VLEN/`WORD_WIDTH/2)  ][`HWORD_WIDTH-1]==1'b0),1'b0};
+                // same underoverflow condition as the even-uop_index branch, on the re-based element indices
+                underoverflow[4*j +: 4] = {
+                  ({cout32[2*(j-`VLEN/`WORD_WIDTH/2)+1], round32[2*(j-`VLEN/`WORD_WIDTH/2)+1][`HWORD_WIDTH +: `HWORD_WIDTH]}!='1)&(round32[2*(j-`VLEN/`WORD_WIDTH/2)+1][`HWORD_WIDTH-1]==1'b1),1'b0,
+                  ({cout32[2*(j-`VLEN/`WORD_WIDTH/2)  ], round32[2*(j-`VLEN/`WORD_WIDTH/2)  ][`HWORD_WIDTH +: `HWORD_WIDTH]}!='1)&(round32[2*(j-`VLEN/`WORD_WIDTH/2)  ][`HWORD_WIDTH-1]==1'b1),1'b0};
+              end
+            end
+          end
+ endcase
end
endcase
end
@@ -991,7 +1093,7 @@
logic [`BYTE_WIDTH:0] result;
- result = src_x +cin;
+ result = cin ? src_x + 1'b1 : src_x;
f_half_add8 = result[`BYTE_WIDTH-1:0];
endfunction
@@ -1001,7 +1103,7 @@
input logic [`HWORD_WIDTH:0] src_x;
input logic cin;
- f_half_add16 = src_x + cin;
+ f_half_add16 = cin ? src_x + 1'b1 : src_x;
endfunction
function [`WORD_WIDTH:0] f_half_add32;
@@ -1009,7 +1111,7 @@
input logic [`WORD_WIDTH:0] src_x;
input logic cin;
- f_half_add32 = src_x + cin;
+ f_half_add32 = cin ? src_x + 1'b1 : src_x;
endfunction
endmodule
diff --git a/hdl/verilog/rvv/design/rvv_backend_decode_ctrl.sv b/hdl/verilog/rvv/design/rvv_backend_decode_ctrl.sv
index 886ff0f..388a50a 100644
--- a/hdl/verilog/rvv/design/rvv_backend_decode_ctrl.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_decode_ctrl.sv
@@ -157,9 +157,21 @@
case(1'b1)
uop_index_enable_unit0:
- uop_index_din = uop_index_remain + 'd4;
- uop_index_enable_unit1:
- uop_index_din = 'd4 - quantity;
+ uop_index_din = uop_de2uq[0][`NUM_DE_UOP-1].uop_index + 1'b1;
+ uop_index_enable_unit1: begin
+ case(quantity)
+ 'd0:
+ uop_index_din = uop_de2uq[1][3].uop_index + 1'b1;
+ 'd1:
+ uop_index_din = uop_de2uq[1][2].uop_index + 1'b1;
+ 'd2:
+ uop_index_din = uop_de2uq[1][1].uop_index + 1'b1;
+ 'd3:
+ uop_index_din = uop_de2uq[1][0].uop_index + 1'b1;
+ 'd4:
+ uop_index_din = uop_de2uq[1][0].uop_index;
+ endcase
+ end
endcase
end
diff --git a/hdl/verilog/rvv/design/rvv_backend_decode_unit_ari.sv b/hdl/verilog/rvv/design/rvv_backend_decode_unit_ari.sv
index 26807cd..0cd9714 100644
--- a/hdl/verilog/rvv/design/rvv_backend_decode_unit_ari.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_decode_unit_ari.sv
@@ -5069,17 +5069,18 @@
VRSUB,
VADC,
VMADC,
- VSBC,
VAND,
VOR,
VXOR,
VMSEQ,
VMSNE,
+ VMSLEU,
VMSLE,
+ VMSGTU,
VMSGT,
VMERGE_VMV,
- VSADD,
- VNCLIP: begin
+ VSADDU,
+ VSADD: begin
case(inst_funct3)
OPIVX: begin
uop[i].rs1_data = rs1_data;
@@ -5093,13 +5094,14 @@
end
VSUB,
+ VSBC,
VMSBC,
VMSLTU,
VMSLT,
- VMIN,
- VMAX,
VMINU,
+ VMIN,
VMAXU,
+ VMAX,
VSSUBU,
VSSUB,
VSMUL_VMVNRR: begin
@@ -5116,12 +5118,10 @@
VSRA,
VNSRL,
VNSRA,
- VMSLEU,
- VMSGTU,
- VSADDU,
VSSRL,
VSSRA,
VNCLIPU,
+ VNCLIP,
VSLIDEUP_RGATHEREI16,
VSLIDEDOWN,
VRGATHER: begin
diff --git a/hdl/verilog/rvv/inc/rvv_backend_define.svh b/hdl/verilog/rvv/inc/rvv_backend_define.svh
index dea08ab..2801cdb 100755
--- a/hdl/verilog/rvv/inc/rvv_backend_define.svh
+++ b/hdl/verilog/rvv/inc/rvv_backend_define.svh
@@ -29,11 +29,11 @@
// the depth of queue/station/buffer
`define CQ_DEPTH 8
`define UQ_DEPTH 16
-`define ALU_RS_DEPTH 2
+`define ALU_RS_DEPTH 4
`define PMTRDT_RS_DEPTH 8
-`define MUL_RS_DEPTH 2
-`define DIV_RS_DEPTH 2
-`define LSU_RS_DEPTH 2
+`define MUL_RS_DEPTH 4
+`define DIV_RS_DEPTH 4
+`define LSU_RS_DEPTH 4
`define ROB_DEPTH 8
`define ROB_DEPTH_WIDTH $clog2(`ROB_DEPTH)