1. Fix ari decoder for immediate data; 2. fix decoder ctrl; 3. fix addsub and shifter saturate logic; 4. optimize viota instruction for better timing; 5. double reservation station depth.

Change-Id: Ic381343f4316c0f79db4533219c52836718a3b93
diff --git a/hdl/verilog/rvv/design/rvv_backend_alu_unit_addsub.sv b/hdl/verilog/rvv/design/rvv_backend_alu_unit_addsub.sv
index 61ad83c..334ec1a 100644
--- a/hdl/verilog/rvv/design/rvv_backend_alu_unit_addsub.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_alu_unit_addsub.sv
@@ -1000,58 +1000,133 @@
     round8  = 'b0;
     round16 = 'b0;
     round32 = 'b0;
+    
+    case(uop_funct6.ari_funct6)
+      VAADDU,
+      VASUBU: begin
+        case(vxrm)
+          RNU: begin
+            for(int i=0;i<`VLENB;i=i+1) begin
+              round8[i] = f_half_add8({cout8[i],product8[i][`BYTE_WIDTH-1:1]}, product8[i][0]); 
+            end
 
-    case(vxrm)
-      RNU: begin
-        for(int i=0;i<`VLENB;i=i+1) begin
-          round8[i] = f_half_add8({cout8[i],product8[i][`BYTE_WIDTH-1:1]}, product8[i][0]); 
-        end
-        
-        for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
-          round16[i] = f_half_add16({cout16[i],product16[i][`HWORD_WIDTH-1:1]}, product16[i][0]); 
-        end
-        for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
-          round32[i] = f_half_add32({cout32[i],product32[i][`WORD_WIDTH-1:1]}, product32[i][0]); 
-        end
+            for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
+              round16[i] = f_half_add16({cout16[i],product16[i][`HWORD_WIDTH-1:1]}, product16[i][0]); 
+            end
+
+            for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
+              round32[i] = f_half_add32({cout32[i],product32[i][`WORD_WIDTH-1:1]}, product32[i][0]); 
+            end
+          end
+          RNE: begin
+            for(int i=0;i<`VLENB;i=i+1) begin
+              round8[i] = f_half_add8({cout8[i],product8[i][`BYTE_WIDTH-1:1]}, (product8[i][0]&product8[i][1])); 
+            end
+    
+            for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
+              round16[i] = f_half_add16({cout16[i],product16[i][`HWORD_WIDTH-1:1]}, (product16[i][0]&product16[i][1])); 
+            end
+    
+            for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
+              round32[i] = f_half_add32({cout32[i],product32[i][`WORD_WIDTH-1:1]}, (product32[i][0]&product32[i][1])); 
+            end
+          end
+          RDN: begin
+            for(int i=0;i<`VLENB;i=i+1) begin
+              round8[i] = {cout8[i],product8[i][`BYTE_WIDTH-1:1]}; 
+            end
+    
+            for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
+              round16[i] = {cout16[i],product16[i][`HWORD_WIDTH-1:1]}; 
+            end
+    
+            for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
+              round32[i] = {cout32[i],product32[i][`WORD_WIDTH-1:1]}; 
+            end
+          end
+          ROD: begin
+            for(int i=0;i<`VLENB;i=i+1) begin
+              round8[i] = f_half_add8({cout8[i],product8[i][`BYTE_WIDTH-1:1]}, ((!product8[i][1])&product8[i][0])); 
+            end
+    
+            for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
+              round16[i] = f_half_add16({cout16[i],product16[i][`HWORD_WIDTH-1:1]}, ((!product16[i][1])&product16[i][0])); 
+            end
+    
+            for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
+              round32[i] = f_half_add32({cout32[i],product32[i][`WORD_WIDTH-1:1]}, ((!product32[i][1])&product32[i][0])); 
+            end
+          end
+        endcase
       end
-      RNE: begin
-        for(int i=0;i<`VLENB;i=i+1) begin
-          round8[i] = f_half_add8({cout8[i],product8[i][`BYTE_WIDTH-1:1]}, (product8[i][0]&product8[i][1])); 
-        end
+      VAADD,
+      VASUB: begin
+        case(vxrm)
+          RNU: begin
+            for(int i=0;i<`VLENB;i=i+1) begin
+              round8[i] = f_half_add8({src2_data[i][`BYTE_WIDTH-1]^src1_data[i][`BYTE_WIDTH-1]?(!cout8[i]):cout8[i],
+                                      product8[i][`BYTE_WIDTH-1:1]}, product8[i][0]); 
+            end
+            
+            for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
+              round16[i] = f_half_add16({src2_data[2*i+1][`BYTE_WIDTH-1]^src1_data[2*i+1][`BYTE_WIDTH-1]?(!cout16[i]):cout16[i],
+                                        product16[i][`HWORD_WIDTH-1:1]}, product16[i][0]); 
+            end
 
-        for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
-          round16[i] = f_half_add16({cout16[i],product16[i][`HWORD_WIDTH-1:1]}, (product16[i][0]&product16[i][1])); 
-        end
-
-        for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
-          round32[i] = f_half_add32({cout32[i],product32[i][`WORD_WIDTH-1:1]}, (product32[i][0]&product32[i][1])); 
-        end
-      end
-      RDN: begin
-        for(int i=0;i<`VLENB;i=i+1) begin
-          round8[i] = {cout8[i],product8[i][`BYTE_WIDTH-1:1]}; 
-        end
-
-        for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
-          round16[i] = {cout16[i],product16[i][`HWORD_WIDTH-1:1]}; 
-        end
-
-        for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
-          round32[i] = {cout32[i],product32[i][`WORD_WIDTH-1:1]}; 
-        end
-      end
-      ROD: begin
-        for(int i=0;i<`VLENB;i=i+1) begin
-          round8[i] = f_half_add8({cout8[i],product8[i][`BYTE_WIDTH-1:1]}, ((!product8[i][1])&product8[i][0])); 
-        end
-
-        for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
-          round16[i] = f_half_add16({cout16[i],product16[i][`HWORD_WIDTH-1:1]}, ((!product16[i][1])&product16[i][0])); 
-        end
-
-        for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
-          round32[i] = f_half_add32({cout32[i],product32[i][`WORD_WIDTH-1:1]}, ((!product32[i][1])&product32[i][0])); 
-        end
+            for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
+              round32[i] = f_half_add32({src2_data[4*i+3][`BYTE_WIDTH-1]^src1_data[4*i+3][`BYTE_WIDTH-1]?(!cout32[i]):cout32[i],
+                                        product32[i][`WORD_WIDTH-1:1]}, product32[i][0]); 
+            end
+          end
+          RNE: begin
+            for(int i=0;i<`VLENB;i=i+1) begin
+              round8[i] = f_half_add8({src2_data[i][`BYTE_WIDTH-1]^src1_data[i][`BYTE_WIDTH-1]?(!cout8[i]):cout8[i],
+                                      product8[i][`BYTE_WIDTH-1:1]}, (product8[i][0]&product8[i][1])); 
+            end
+    
+            for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
+              round16[i] = f_half_add16({src2_data[2*i+1][`BYTE_WIDTH-1]^src1_data[2*i+1][`BYTE_WIDTH-1]?(!cout16[i]):cout16[i],
+                                        product16[i][`HWORD_WIDTH-1:1]}, (product16[i][0]&product16[i][1])); 
+            end
+    
+            for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
+              round32[i] = f_half_add32({src2_data[4*i+3][`BYTE_WIDTH-1]^src1_data[4*i+3][`BYTE_WIDTH-1]?(!cout32[i]):cout32[i],
+                                        product32[i][`WORD_WIDTH-1:1]}, (product32[i][0]&product32[i][1])); 
+            end
+          end
+          RDN: begin
+            for(int i=0;i<`VLENB;i=i+1) begin
+              round8[i] = {src2_data[i][`BYTE_WIDTH-1]^src1_data[i][`BYTE_WIDTH-1]?(!cout8[i]):cout8[i],
+                           product8[i][`BYTE_WIDTH-1:1]}; 
+            end
+    
+            for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
+              round16[i] = {src2_data[2*i+1][`BYTE_WIDTH-1]^src1_data[2*i+1][`BYTE_WIDTH-1]?(!cout16[i]):cout16[i],
+                            product16[i][`HWORD_WIDTH-1:1]}; 
+            end
+    
+            for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
+              round32[i] = {src2_data[4*i+3][`BYTE_WIDTH-1]^src1_data[4*i+3][`BYTE_WIDTH-1]?(!cout32[i]):cout32[i],
+                            product32[i][`WORD_WIDTH-1:1]}; 
+            end
+          end
+          ROD: begin
+            for(int i=0;i<`VLENB;i=i+1) begin
+              round8[i] = f_half_add8({src2_data[i][`BYTE_WIDTH-1]^src1_data[i][`BYTE_WIDTH-1]?(!cout8[i]):cout8[i],
+                                        product8[i][`BYTE_WIDTH-1:1]}, ((!product8[i][1])&product8[i][0])); 
+            end
+    
+            for(int i=0;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
+              round16[i] = f_half_add16({src2_data[2*i+1][`BYTE_WIDTH-1]^src1_data[2*i+1][`BYTE_WIDTH-1]?(!cout16[i]):cout16[i],
+                                        product16[i][`HWORD_WIDTH-1:1]}, ((!product16[i][1])&product16[i][0])); 
+            end
+    
+            for(int i=0;i<`VLEN/`WORD_WIDTH;i=i+1) begin
+              round32[i] = f_half_add32({src2_data[4*i+3][`BYTE_WIDTH-1]^src1_data[4*i+3][`BYTE_WIDTH-1]?(!cout32[i]):cout32[i],
+                                        product32[i][`WORD_WIDTH-1:1]}, ((!product32[i][1])&product32[i][0])); 
+            end
+          end
+        endcase
       end
     endcase
   end
@@ -1073,79 +1148,79 @@
             addu_upoverflow[4*j +: 4] = {cout8[4*j+3],cout8[4*j+2],cout8[4*j+1],cout8[4*j]};
 
             add_upoverflow[4*j +: 4] = {
-              ((cout8[4*j+3]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
-              ((cout8[4*j+2]==1'b1)&(src2_data[4*j+2][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+2][`BYTE_WIDTH-1]==1'b0)),
-              ((cout8[4*j+1]==1'b1)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b0)),
-              ((cout8[4*j]  ==1'b1)&(src2_data[4*j  ][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j  ][`BYTE_WIDTH-1]==1'b0))};
+              ((product8[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
+              ((product8[4*j+2][`BYTE_WIDTH-1]==1'b1)&(src2_data[4*j+2][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+2][`BYTE_WIDTH-1]==1'b0)),
+              ((product8[4*j+1][`BYTE_WIDTH-1]==1'b1)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b0)),
+              ((product8[4*j  ][`BYTE_WIDTH-1]==1'b1)&(src2_data[4*j  ][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j  ][`BYTE_WIDTH-1]==1'b0))};
 
             add_underoverflow[4*j +: 4] = {
-              ((cout8[4*j+3]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
-              ((cout8[4*j+2]==1'b0)&(src2_data[4*j+2][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+2][`BYTE_WIDTH-1]==1'b1)),
-              ((cout8[4*j+1]==1'b0)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b1)),
-              ((cout8[4*j]  ==1'b0)&(src2_data[4*j  ][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j  ][`BYTE_WIDTH-1]==1'b1))};
+              ((product8[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
+              ((product8[4*j+2][`BYTE_WIDTH-1]==1'b0)&(src2_data[4*j+2][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+2][`BYTE_WIDTH-1]==1'b1)),
+              ((product8[4*j+1][`BYTE_WIDTH-1]==1'b0)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b1)),
+              ((product8[4*j  ][`BYTE_WIDTH-1]==1'b0)&(src2_data[4*j  ][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j  ][`BYTE_WIDTH-1]==1'b1))};
             
             subu_underoverflow[4*j +: 4] = {cout8[4*j+3],cout8[4*j+2],cout8[4*j+1],cout8[4*j]};
 
             sub_upoverflow[4*j +: 4] = {
-              ((cout8[4*j+3]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
-              ((cout8[4*j+2]==1'b1)&(src2_data[4*j+2][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+2][`BYTE_WIDTH-1]==1'b1)),
-              ((cout8[4*j+1]==1'b1)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b1)),
-              ((cout8[4*j]  ==1'b1)&(src2_data[4*j  ][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j  ][`BYTE_WIDTH-1]==1'b1))};
+              ((product8[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
+              ((product8[4*j+2][`BYTE_WIDTH-1]==1'b1)&(src2_data[4*j+2][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+2][`BYTE_WIDTH-1]==1'b1)),
+              ((product8[4*j+1][`BYTE_WIDTH-1]==1'b1)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b1)),
+              ((product8[4*j  ][`BYTE_WIDTH-1]==1'b1)&(src2_data[4*j  ][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j  ][`BYTE_WIDTH-1]==1'b1))};
 
             sub_underoverflow[4*j +: 4] = {
-              ((cout8[4*j+3]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
-              ((cout8[4*j+2]==1'b0)&(src2_data[4*j+2][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+2][`BYTE_WIDTH-1]==1'b0)),
-              ((cout8[4*j+1]==1'b0)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b0)),
-              ((cout8[4*j]  ==1'b0)&(src2_data[4*j  ][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j  ][`BYTE_WIDTH-1]==1'b0))};
+              ((product8[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
+              ((product8[4*j+2][`BYTE_WIDTH-1]==1'b0)&(src2_data[4*j+2][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+2][`BYTE_WIDTH-1]==1'b0)),
+              ((product8[4*j+1][`BYTE_WIDTH-1]==1'b0)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b0)),
+              ((product8[4*j  ][`BYTE_WIDTH-1]==1'b0)&(src2_data[4*j  ][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j  ][`BYTE_WIDTH-1]==1'b0))};
           end
           EEW16: begin
             addu_upoverflow[4*j +: 4] = {cout16[2*j+1],1'b0,cout16[2*j],1'b0};
 
             add_upoverflow[4*j +: 4] = {
-              ((cout16[2*j+1]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
+              ((product16[2*j+1][`HWORD_WIDTH-1]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
               1'b0,
-              ((cout16[2*j]  ==1'b1)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b0)),
+              ((product16[2*j  ][`HWORD_WIDTH-1]==1'b1)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b0)),
               1'b0};
 
             add_underoverflow[4*j +: 4] = {
-              ((cout16[2*j+1]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
+              ((product16[2*j+1][`HWORD_WIDTH-1]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
               1'b0,
-              ((cout16[2*j]  ==1'b0)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b1)),
+              ((product16[2*j  ][`HWORD_WIDTH-1]==1'b0)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b1)),
               1'b0};
 
             subu_underoverflow[4*j +: 4] = {cout16[2*j+1],1'b0,cout16[2*j],1'b0};
 
             sub_upoverflow[4*j +: 4] = {
-              ((cout16[2*j+1]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
+              ((product16[2*j+1][`HWORD_WIDTH-1]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
               1'b0,
-              ((cout16[2*j]  ==1'b1)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b1)),
+              ((product16[2*j  ][`HWORD_WIDTH-1]==1'b1)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b1)),
               1'b0};
 
             sub_underoverflow[4*j +: 4] = {
-              ((cout16[2*j+1]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
+              ((product16[2*j+1][`HWORD_WIDTH-1]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
               1'b0,
-              ((cout16[2*j]  ==1'b0)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b0)),
+              ((product16[2*j  ][`HWORD_WIDTH-1]==1'b0)&(src2_data[4*j+1][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+1][`BYTE_WIDTH-1]==1'b0)),
               1'b0};
           end
           EEW32: begin
             addu_upoverflow[4*j +: 4] = {cout32[j],3'b0};
 
             add_upoverflow[4*j +: 4] = {
-              ((cout32[j]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
+              ((product32[j][`WORD_WIDTH-1]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
               3'b0};
 
             add_underoverflow[4*j +: 4] = {
-              ((cout32[j]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
+              ((product32[j][`WORD_WIDTH-1]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
               3'b0};
 
             subu_underoverflow[4*j +: 4] = {cout32[j],3'b0};
 
             sub_upoverflow[4*j +: 4] = {
-              ((cout32[j]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
+              ((product32[j][`WORD_WIDTH-1]==1'b1)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b0)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b1)),
               3'b0};
 
             sub_underoverflow[4*j +: 4] = {
-              ((cout32[j]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
+              ((product32[j][`WORD_WIDTH-1]==1'b0)&(src2_data[4*j+3][`BYTE_WIDTH-1]==1'b1)&(src1_data[4*j+3][`BYTE_WIDTH-1]==1'b0)),
               3'b0};
           end
         endcase
diff --git a/hdl/verilog/rvv/design/rvv_backend_alu_unit_mask.sv b/hdl/verilog/rvv/design/rvv_backend_alu_unit_mask.sv
index dab4550..d7f1a43 100644
--- a/hdl/verilog/rvv/design/rvv_backend_alu_unit_mask.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_alu_unit_mask.sv
@@ -65,18 +65,16 @@
   logic   [`VLEN-1:0]                     result_data_vfirst;
   logic   [`VLEN/16-1:0][4:0]             result_data_vcpop;
   logic   [`VLEN-1:0][$clog2(`VLEN)-1:0]               result_data_viota;
-  logic   [`VLEN/4-1:0][$clog2(`VLEN/4)-1:0]           result_data_viota_pct25;
-  logic   [`VLEN/4-1:0][$clog2(`VLEN/4)-1:0]           result_data_viota_pct50;
-  logic   [`VLEN/2-1:0][$clog2(`VLEN/2)-1:0]           result_data_viota_pct100;
+  logic   [`VLEN-1:0][$clog2(`BYTE_WIDTH):0]           result_data_viota_per8;
   logic   [`VLENB-1:0][$clog2(`VLEN)-1:0]              result_data_viota8;
   logic   [`VLEN/`HWORD_WIDTH-1:0][$clog2(`VLEN)-1:0]  result_data_viota16;
   logic   [`VLEN/`WORD_WIDTH-1:0][$clog2(`VLEN)-1:0]   result_data_viota32;
   logic   [`VLEN-1:0]                     result_data_vid8;
   logic   [`VLEN-1:0]                     result_data_vid16;
   logic   [`VLEN-1:0]                     result_data_vid32;
-  
+
   // for-loop
-  genvar                                  j;
+  genvar                          j;
 
 //
 // prepare source data to calculate    
@@ -226,7 +224,7 @@
     // initial the data
     src2_data       = 'b0;
     src1_data       = 'b0;
-    src2_data_viota = 'b0;
+    src2_data_viota = 'b0; 
 
     // prepare source data
     case(uop_funct3)
@@ -299,9 +297,9 @@
               end
               VIOTA: begin
                 if (vm==1'b1)
-                  src2_data_viota = vs2_data;
+                  src2_data_viota = {vs2_data,1'b0};
                 else
-                  src2_data_viota = vs2_data&v0_data; 
+                  src2_data_viota = {vs2_data&v0_data,1'b0}; 
               end
               // no source operand for VID
             endcase
@@ -352,32 +350,39 @@
   endgenerate
 
   // viota 
-  assign result_data_viota_pct25[0]  = 'b0;
-  assign result_data_viota_pct50[0]  = 'b0;
-  assign result_data_viota_pct100[0] = 'b0;
-
   generate
-    for(j=1; j<`VLEN/4;j++) begin: VIOTA_PCT25
-      assign result_data_viota_pct25[j] = src2_data_viota[j-1]+result_data_viota_pct25[j-1];
+    for(j=0; j<`VLENB;j++) begin: GET_VIOTA_PER8 // one f_viota8 per 8-bit slice of src2_data_viota; each slice is counted independently
+      assign {result_data_viota_per8[8*j+7],
+              result_data_viota_per8[8*j+6], 
+              result_data_viota_per8[8*j+5], 
+              result_data_viota_per8[8*j+4], 
+              result_data_viota_per8[8*j+3], 
+              result_data_viota_per8[8*j+2], 
+              result_data_viota_per8[8*j+1], 
+              result_data_viota_per8[8*j]} = f_viota8(src2_data_viota[8*j +: 8]);
     end
 
-    for(j=1; j<`VLEN/4;j++) begin: VIOTA_PCT50
-      assign result_data_viota_pct50[j] = src2_data_viota[`VLEN/4+j-1]+result_data_viota_pct50[j-1];
-    end
-
-    for(j=1; j<`VLEN/2;j++) begin: VIOTA_PCT100
-      assign result_data_viota_pct100[j] = src2_data_viota[`VLEN/2+j-1]+result_data_viota_pct100[j-1];
-    end
-  endgenerate
-
-  generate
-    for(j=0; j<`VLEN;j++) begin: GET_VIOTA
-      if (j<`VLEN/4)
-        assign result_data_viota[j] = result_data_viota_pct25[j];
-      else if (j<`VLEN/2)
-        assign result_data_viota[j] = result_data_viota_pct50[j-`VLEN/4]+result_data_viota_pct25[`VLEN/4-1];
-      else
-        assign result_data_viota[j] = result_data_viota_pct100[j-`VLEN/2]+result_data_viota_pct50[`VLEN/4-1];
+    for(j=0; j<`VLENB;j++) begin: GET_VIOTA // turn the independent per-slice counts into a running count across the whole vector
+      if (j==0) begin // first slice has no carry-in
+        assign result_data_viota[0] = result_data_viota_per8[0];
+        assign result_data_viota[1] = result_data_viota_per8[1];
+        assign result_data_viota[2] = result_data_viota_per8[2];
+        assign result_data_viota[3] = result_data_viota_per8[3];
+        assign result_data_viota[4] = result_data_viota_per8[4];
+        assign result_data_viota[5] = result_data_viota_per8[5];
+        assign result_data_viota[6] = result_data_viota_per8[6];
+        assign result_data_viota[7] = result_data_viota_per8[7];
+      end
+      else begin // carry-in must be the cumulative total at the end of slice j-1 (result_data_viota), not that slice's own count (result_data_viota_per8)
+        assign result_data_viota[8*j  ] = result_data_viota_per8[8*j  ] + result_data_viota[8*j-1];
+        assign result_data_viota[8*j+1] = result_data_viota_per8[8*j+1] + result_data_viota[8*j-1];
+        assign result_data_viota[8*j+2] = result_data_viota_per8[8*j+2] + result_data_viota[8*j-1];
+        assign result_data_viota[8*j+3] = result_data_viota_per8[8*j+3] + result_data_viota[8*j-1];
+        assign result_data_viota[8*j+4] = result_data_viota_per8[8*j+4] + result_data_viota[8*j-1];
+        assign result_data_viota[8*j+5] = result_data_viota_per8[8*j+5] + result_data_viota[8*j-1];
+        assign result_data_viota[8*j+6] = result_data_viota_per8[8*j+6] + result_data_viota[8*j-1];
+        assign result_data_viota[8*j+7] = result_data_viota_per8[8*j+7] + result_data_viota[8*j-1];
+      end
     end
   endgenerate
   
@@ -529,9 +534,9 @@
 // submit result to ROB
 //
 `ifdef TB_SUPPORT
-  assign result.uop_pc = alu_uop.uop_pc;
+  assign  result.uop_pc = alu_uop.uop_pc;
 `endif
-  assign result.rob_entry = rob_entry;
+  assign  result.rob_entry = rob_entry;
 
   // result data
   generate 
@@ -602,7 +607,7 @@
 
   // result valid signal
   assign result.w_valid = result_valid;
-
+  
   // saturate signal
   assign result.vsaturate = 'b0;
 
@@ -694,4 +699,62 @@
     f_vmsbf = {1'b0, (src2[`VLEN-1:1]-1) & src2[`VLEN-1:1]};
   endfunction
 
+  // viota helper: f_viota4[k] is the number of set bits in src[k:0] (inclusive running count over 4 bits)
+  function [3:0][2:0] f_viota4;
+    input logic [3:0] src;
+    
+    if (src[0]==1'b1) // element 0: count of src[0] only
+      f_viota4[0] = 'd1;
+    else
+      f_viota4[0] = 'b0;
+      
+    if (src[1:0]==2'b11) // element 1: count of src[1:0]
+      f_viota4[1] = 'd2;
+    else if ((src[1:0]==2'b10)|(src[1:0]==2'b01))
+      f_viota4[1] = 'd1;
+    else
+      f_viota4[1] = 'b0;
+
+    if (src[2:0]==3'b111) // element 2: count of src[2:0]
+      f_viota4[2] = 'd3;
+    else if ((src[2:0]==3'b011)|(src[2:0]==3'b101)|(src[2:0]==3'b110))
+      f_viota4[2] = 'd2;
+    else if ((src[2:0]==3'b001)|(src[2:0]==3'b010)|(src[2:0]==3'b100))
+      f_viota4[2] = 'd1;
+    else
+      f_viota4[2] = 'b0;
+
+    if (src[3:0]==4'b1111) // element 3: count of src[3:0], i.e. the total for the nibble
+      f_viota4[3] = 'd4;
+    else if ((src[3:0]==4'b0111)|(src[3:0]==4'b1011)|(src[3:0]==4'b1101)|(src[3:0]==4'b1110))
+      f_viota4[3] = 'd3;
+    else if ((src[3:0]==4'b0011)|(src[3:0]==4'b0101)|(src[3:0]==4'b1001)|(src[3:0]==4'b0110)|(src[3:0]==4'b1010)|(src[3:0]==4'b1100))
+      f_viota4[3] = 'd2;
+    else if ((src[3:0]==4'b0001)|(src[3:0]==4'b0010)|(src[3:0]==4'b0100)|(src[3:0]==4'b1000))
+      f_viota4[3] = 'd1;
+    else
+      f_viota4[3] = 'b0;
+  
+  endfunction
+
+  function [7:0][3:0] f_viota8; // f_viota8[k] is the number of set bits in src[k:0], built from two f_viota4 halves
+    input logic [7:0] src;
+
+    logic [3:0][2:0] viota4_lo;
+    logic [3:0][2:0] viota4_hi;
+    
+    viota4_lo = f_viota4(src[3:0]);
+    viota4_hi = f_viota4(src[7:4]);
+
+    f_viota8[0] = viota4_lo[0]; // lower nibble: running counts pass through unchanged
+    f_viota8[1] = viota4_lo[1];
+    f_viota8[2] = viota4_lo[2];
+    f_viota8[3] = viota4_lo[3];
+    f_viota8[4] = viota4_hi[0]+viota4_lo[3]; // upper nibble: add the lower nibble's total (viota4_lo[3])
+    f_viota8[5] = viota4_hi[1]+viota4_lo[3];
+    f_viota8[6] = viota4_hi[2]+viota4_lo[3];
+    f_viota8[7] = viota4_hi[3]+viota4_lo[3];
+
+  endfunction
+
 endmodule
diff --git a/hdl/verilog/rvv/design/rvv_backend_alu_unit_shift.sv b/hdl/verilog/rvv/design/rvv_backend_alu_unit_shift.sv
index 51eb375..705b6f6 100755
--- a/hdl/verilog/rvv/design/rvv_backend_alu_unit_shift.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_alu_unit_shift.sv
@@ -668,34 +668,136 @@
           
         case(vs2_eew)
           EEW16: begin
-            // unsigned overflow check for vnclipu
-            if (opcode == SHIFT_SRL) begin
-              upoverflow[4*j +: 4] = {
-                ({cout16[2*j+1], round16[2*j+1][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0),1'b0,
-                ({cout16[2*j],   round16[2*j][  `BYTE_WIDTH +: `BYTE_WIDTH]}!='b0),1'b0};
-            end
-            else if (opcode == SHIFT_SRA) begin
-            // signed overflow check for vnclip
-              upoverflow[4*j +: 4] = {
-                ({cout16[2*j+1], round16[2*j+1][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)&(round16[2*j+1][`BYTE_WIDTH-1]==1'b0),1'b0,
-                ({cout16[2*j],   round16[2*j][  `BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)&(round16[2*j][  `BYTE_WIDTH-1]==1'b0),1'b0};
-
-              underoverflow[4*j +: 4] = {
-                ((&{cout16[2*j+1], round16[2*j+1][`BYTE_WIDTH +: `BYTE_WIDTH]})!=1'b1)&(round16[2*j+1][`BYTE_WIDTH-1]==1'b1),1'b0,
-                ((&{cout16[2*j],   round16[2*j][  `BYTE_WIDTH +: `BYTE_WIDTH]})!=1'b1)&(round16[2*j][  `BYTE_WIDTH-1]==1'b1),1'b0};
-            end
+            case(opcode)
+              SHIFT_SRL: begin
+              // unsigned overflow check for vnclipu
+                if(uop_index[0]==1'b0) begin
+                  if(j<`VLEN/`WORD_WIDTH/2) begin
+                    upoverflow[4*j +: 4] = {
+                      ({cout16[4*j+3], round16[4*j+3][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0),
+                      ({cout16[4*j+2], round16[4*j+2][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0),
+                      ({cout16[4*j+1], round16[4*j+1][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0),
+                      ({cout16[4*j  ], round16[4*j  ][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)};
+                  end
+                  else begin
+                    upoverflow[4*j +: 4] = 'b0;
+                  end
+                end
+                else begin
+                  if(j<`VLEN/`WORD_WIDTH/2) begin
+                    upoverflow[4*j +: 4] = 'b0;
+                  end
+                  else begin
+                    upoverflow[4*j +: 4] = {
+                      ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)+3], round16[4*(j-`VLEN/`WORD_WIDTH/2)+3][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0),
+                      ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)+2], round16[4*(j-`VLEN/`WORD_WIDTH/2)+2][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0),
+                      ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)+1], round16[4*(j-`VLEN/`WORD_WIDTH/2)+1][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0),
+                      ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)  ], round16[4*(j-`VLEN/`WORD_WIDTH/2)  ][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)};
+                  end
+                end
+              end
+              SHIFT_SRA: begin
+              // signed overflow check for vnclip
+                if(uop_index[0]==1'b0) begin
+                  if(j<`VLEN/`WORD_WIDTH/2) begin
+                    upoverflow[4*j +: 4] = {
+                      ({cout16[4*j+3], round16[4*j+3][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)&(round16[4*j+3][`BYTE_WIDTH-1]==1'b0),
+                      ({cout16[4*j+2], round16[4*j+2][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)&(round16[4*j+2][`BYTE_WIDTH-1]==1'b0),
+                      ({cout16[4*j+1], round16[4*j+1][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)&(round16[4*j+1][`BYTE_WIDTH-1]==1'b0),
+                      ({cout16[4*j  ], round16[4*j  ][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)&(round16[4*j  ][`BYTE_WIDTH-1]==1'b0)};
+  
+                    underoverflow[4*j +: 4] = {
+                      ({cout16[4*j+3], round16[4*j+3][`BYTE_WIDTH +: `BYTE_WIDTH]}!='1)&(round16[4*j+3][`BYTE_WIDTH-1]==1'b1),
+                      ({cout16[4*j+2], round16[4*j+2][`BYTE_WIDTH +: `BYTE_WIDTH]}!='1)&(round16[4*j+2][`BYTE_WIDTH-1]==1'b1),
+                      ({cout16[4*j+1], round16[4*j+1][`BYTE_WIDTH +: `BYTE_WIDTH]}!='1)&(round16[4*j+1][`BYTE_WIDTH-1]==1'b1),
+                      ({cout16[4*j  ], round16[4*j  ][`BYTE_WIDTH +: `BYTE_WIDTH]}!='1)&(round16[4*j  ][`BYTE_WIDTH-1]==1'b1)};
+                  end
+                  else begin
+                    upoverflow[4*j +: 4] = 'b0;
+                    underoverflow[4*j +: 4] = 'b0;
+                  end
+                end
+                else begin
+                  if(j<`VLEN/`WORD_WIDTH/2) begin
+                    upoverflow[4*j +: 4] = 'b0;
+                    underoverflow[4*j +: 4] = 'b0;
+                  end
+                  else begin
+                    upoverflow[4*j +: 4] = {
+                      ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)+3], round16[4*(j-`VLEN/`WORD_WIDTH/2)+3][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)&(round16[4*(j-`VLEN/`WORD_WIDTH/2)+3][`BYTE_WIDTH-1]==1'b0),
+                      ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)+2], round16[4*(j-`VLEN/`WORD_WIDTH/2)+2][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)&(round16[4*(j-`VLEN/`WORD_WIDTH/2)+2][`BYTE_WIDTH-1]==1'b0),
+                      ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)+1], round16[4*(j-`VLEN/`WORD_WIDTH/2)+1][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)&(round16[4*(j-`VLEN/`WORD_WIDTH/2)+1][`BYTE_WIDTH-1]==1'b0),
+                      ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)  ], round16[4*(j-`VLEN/`WORD_WIDTH/2)  ][`BYTE_WIDTH +: `BYTE_WIDTH]}!='b0)&(round16[4*(j-`VLEN/`WORD_WIDTH/2)  ][`BYTE_WIDTH-1]==1'b0)};
+  
+                    underoverflow[4*j +: 4] = {
+                      ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)+3], round16[4*(j-`VLEN/`WORD_WIDTH/2)+3][`BYTE_WIDTH +: `BYTE_WIDTH]}!='1)&(round16[4*(j-`VLEN/`WORD_WIDTH/2)+3][`BYTE_WIDTH-1]==1'b1),
+                      ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)+2], round16[4*(j-`VLEN/`WORD_WIDTH/2)+2][`BYTE_WIDTH +: `BYTE_WIDTH]}!='1)&(round16[4*(j-`VLEN/`WORD_WIDTH/2)+2][`BYTE_WIDTH-1]==1'b1),
+                      ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)+1], round16[4*(j-`VLEN/`WORD_WIDTH/2)+1][`BYTE_WIDTH +: `BYTE_WIDTH]}!='1)&(round16[4*(j-`VLEN/`WORD_WIDTH/2)+1][`BYTE_WIDTH-1]==1'b1),
+                      ({cout16[4*(j-`VLEN/`WORD_WIDTH/2)  ], round16[4*(j-`VLEN/`WORD_WIDTH/2)  ][`BYTE_WIDTH +: `BYTE_WIDTH]}!='1)&(round16[4*(j-`VLEN/`WORD_WIDTH/2)  ][`BYTE_WIDTH-1]==1'b1)};
+                  end
+                end
+              end
+            endcase
           end
           EEW32: begin
             // unsigned overflow check for vnclipu
-            if (opcode == SHIFT_SRL) begin
-              upoverflow[4*j +: 4] = {({cout32[j], round32[j][`HWORD_WIDTH +: `HWORD_WIDTH]}!='b0), 3'b0};
-            end
-            else if (opcode == SHIFT_SRA) begin
-            // signed overflow check for vnclip
-              upoverflow[4*j +: 4] = {({cout32[j], round32[j][`HWORD_WIDTH +: `HWORD_WIDTH]}!='b0)&(round32[j][`HWORD_WIDTH-1]==1'b0), 3'b0};
-
-              underoverflow[4*j +: 4] = {((&{cout32[j], round32[j][`HWORD_WIDTH +: `HWORD_WIDTH]})!=1'b1)&(round32[j][`HWORD_WIDTH-1]==1'b1), 3'b0};
-            end
+            case(opcode)
+              SHIFT_SRL: begin
+              // unsigned overflow check for vnclipu
+                if(uop_index[0]==1'b0) begin
+                  if(j<`VLEN/`WORD_WIDTH/2) begin
+                    upoverflow[4*j +: 4] = {
+                      ({cout32[2*j+1], round32[2*j+1][`HWORD_WIDTH +: `HWORD_WIDTH]}!='b0),1'b0,
+                      ({cout32[2*j  ], round32[2*j  ][`HWORD_WIDTH +: `HWORD_WIDTH]}!='b0),1'b0};
+                  end
+                  else begin
+                    upoverflow[4*j +: 4] = 'b0;
+                  end
+                end
+                else begin
+                  if(j<`VLEN/`WORD_WIDTH/2) begin
+                    upoverflow[4*j +: 4] = 'b0;
+                  end
+                  else begin
+                    upoverflow[4*j +: 4] = {
+                      ({cout32[2*(j-`VLEN/`WORD_WIDTH/2)+1], round32[2*(j-`VLEN/`WORD_WIDTH/2)+1][`HWORD_WIDTH +: `HWORD_WIDTH]}!='b0),1'b0,
+                      ({cout32[2*(j-`VLEN/`WORD_WIDTH/2)  ], round32[2*(j-`VLEN/`WORD_WIDTH/2)  ][`HWORD_WIDTH +: `HWORD_WIDTH]}!='b0),1'b0};
+                  end
+                end
+              end
+              SHIFT_SRA: begin // signed overflow check for vnclip
+                if(uop_index[0]==1'b0) begin
+                  if(j<`VLEN/`WORD_WIDTH/2) begin
+                    upoverflow[4*j +: 4] = { // EEW32 source: test the upper halfword and bit HWORD_WIDTH-1, matching the odd-uop branch below
+                      ({cout32[2*j+1], round32[2*j+1][`HWORD_WIDTH +: `HWORD_WIDTH]}!='b0)&(round32[2*j+1][`HWORD_WIDTH-1]==1'b0),1'b0,
+                      ({cout32[2*j  ], round32[2*j  ][`HWORD_WIDTH +: `HWORD_WIDTH]}!='b0)&(round32[2*j  ][`HWORD_WIDTH-1]==1'b0),1'b0};
+  
+                    underoverflow[4*j +: 4] = {
+                      ({cout32[2*j+1], round32[2*j+1][`HWORD_WIDTH +: `HWORD_WIDTH]}!='1)&(round32[2*j+1][`HWORD_WIDTH-1]==1'b1),1'b0,
+                      ({cout32[2*j  ], round32[2*j  ][`HWORD_WIDTH +: `HWORD_WIDTH]}!='1)&(round32[2*j  ][`HWORD_WIDTH-1]==1'b1),1'b0};
+                  end
+                  else begin
+                    upoverflow[4*j +: 4] = 'b0;
+                    underoverflow[4*j +: 4] = 'b0;
+                  end
+                end
+                else begin
+                  if(j<`VLEN/`WORD_WIDTH/2) begin
+                    upoverflow[4*j +: 4] = 'b0;
+                    underoverflow[4*j +: 4] = 'b0;
+                  end
+                  else begin
+                    upoverflow[4*j +: 4] = {
+                      ({cout32[2*(j-`VLEN/`WORD_WIDTH/2)+1], round32[2*(j-`VLEN/`WORD_WIDTH/2)+1][`HWORD_WIDTH +: `HWORD_WIDTH]}!='b0)&(round32[2*(j-`VLEN/`WORD_WIDTH/2)+1][`HWORD_WIDTH-1]==1'b0),1'b0,
+                      ({cout32[2*(j-`VLEN/`WORD_WIDTH/2)  ], round32[2*(j-`VLEN/`WORD_WIDTH/2)  ][`HWORD_WIDTH +: `HWORD_WIDTH]}!='b0)&(round32[2*(j-`VLEN/`WORD_WIDTH/2)  ][`HWORD_WIDTH-1]==1'b0),1'b0};
+  
+                    underoverflow[4*j +: 4] = {
+                      ({cout32[2*(j-`VLEN/`WORD_WIDTH/2)+1], round32[2*(j-`VLEN/`WORD_WIDTH/2)+1][`HWORD_WIDTH +: `HWORD_WIDTH]}!='1)&(round32[2*(j-`VLEN/`WORD_WIDTH/2)+1][`HWORD_WIDTH-1]==1'b1),1'b0,
+                      ({cout32[2*(j-`VLEN/`WORD_WIDTH/2)  ], round32[2*(j-`VLEN/`WORD_WIDTH/2)  ][`HWORD_WIDTH +: `HWORD_WIDTH]}!='1)&(round32[2*(j-`VLEN/`WORD_WIDTH/2)  ][`HWORD_WIDTH-1]==1'b1),1'b0};
+                  end
+                end
+              end
+            endcase
           end
         endcase
       end
@@ -991,7 +1093,7 @@
     
     logic [`BYTE_WIDTH:0] result;
 
-    result = src_x +cin;
+    result = cin ? src_x + 1'b1 : src_x;  // conditional increment: src_x+1 when cin is set, src_x unchanged otherwise; only bits [`BYTE_WIDTH-1:0] of result are returned
 
     f_half_add8 = result[`BYTE_WIDTH-1:0];
   endfunction
@@ -1001,7 +1103,7 @@
     input logic [`HWORD_WIDTH:0] src_x;
     input logic                  cin;
 
-    f_half_add16 = src_x + cin;
+    f_half_add16 = cin ? src_x + 1'b1 : src_x;  // conditional increment: src_x+1 when cin is set, src_x unchanged otherwise
   endfunction
 
   function [`WORD_WIDTH:0] f_half_add32;
@@ -1009,7 +1111,7 @@
     input logic [`WORD_WIDTH:0] src_x;
     input logic                 cin;
 
-    f_half_add32 = src_x + cin;
+    f_half_add32 = cin ? src_x + 1'b1 : src_x;  // conditional increment: src_x+1 when cin is set, src_x unchanged otherwise
   endfunction
 
 endmodule
diff --git a/hdl/verilog/rvv/design/rvv_backend_decode_ctrl.sv b/hdl/verilog/rvv/design/rvv_backend_decode_ctrl.sv
index 886ff0f..388a50a 100644
--- a/hdl/verilog/rvv/design/rvv_backend_decode_ctrl.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_decode_ctrl.sv
@@ -157,9 +157,21 @@
     
     case(1'b1)
       uop_index_enable_unit0: 
-        uop_index_din = uop_index_remain + 'd4;    
-      uop_index_enable_unit1:
-        uop_index_din = 'd4 - quantity; 
+        uop_index_din = uop_de2uq[0][`NUM_DE_UOP-1].uop_index + 1'b1;    // unit0: uop_index held in slot `NUM_DE_UOP-1 of unit0, plus one
+      uop_index_enable_unit1: begin  // unit1: which slot supplies the index depends on quantity
+        case(quantity)  // quantity 0..3 reads slot 3..0 of unit1 and adds one; quantity 4 reads slot 0 without the increment
+          'd0:
+            uop_index_din = uop_de2uq[1][3].uop_index + 1'b1; 
+          'd1:
+            uop_index_din = uop_de2uq[1][2].uop_index + 1'b1; 
+          'd2:
+            uop_index_din = uop_de2uq[1][1].uop_index + 1'b1; 
+          'd3:
+            uop_index_din = uop_de2uq[1][0].uop_index + 1'b1; 
+          'd4:  // NOTE(review): slot indices 3..0 and the bound 4 are literals while the unit0 arm uses `NUM_DE_UOP-1 - presumably `NUM_DE_UOP is 4; confirm
+            uop_index_din = uop_de2uq[1][0].uop_index; 
+        endcase  // NOTE(review): no default arm - for quantity > 4 uop_index_din keeps whatever was assigned before this case; confirm an earlier default assignment exists
+      end
     endcase
   end
 
diff --git a/hdl/verilog/rvv/design/rvv_backend_decode_unit_ari.sv b/hdl/verilog/rvv/design/rvv_backend_decode_unit_ari.sv
index 26807cd..0cd9714 100644
--- a/hdl/verilog/rvv/design/rvv_backend_decode_unit_ari.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_decode_unit_ari.sv
@@ -5069,17 +5069,18 @@
             VRSUB,
             VADC,
             VMADC,
-            VSBC,
             VAND,
             VOR,
             VXOR,
             VMSEQ,
             VMSNE,
+            VMSLEU,
             VMSLE,
+            VMSGTU,
             VMSGT,
             VMERGE_VMV,
-            VSADD,
-            VNCLIP: begin
+            VSADDU,
+            VSADD: begin  // NOTE(review): RVV 1.0 sign-extends simm5 for vmsleu.vi / vmsgtu.vi / vsaddu.vi, presumably why the unsigned opcodes share this arm - confirm against the OPIVI branch
               case(inst_funct3)
                 OPIVX: begin
                   uop[i].rs1_data       = rs1_data;
@@ -5093,13 +5094,14 @@
             end
           
             VSUB,
+            VSBC,  // NOTE(review): vsbc has no .vi encoding in RVV 1.0, presumably why it is grouped with VSUB/VMSBC here - confirm against this arm's body
             VMSBC,
             VMSLTU,
             VMSLT,
-            VMIN,
-            VMAX,
             VMINU,
+            VMIN,
             VMAXU,
+            VMAX,
             VSSUBU,
             VSSUB,
             VSMUL_VMVNRR: begin
@@ -5116,12 +5118,10 @@
             VSRA,
             VNSRL,
             VNSRA,
-            VMSLEU,
-            VMSGTU,
-            VSADDU,
             VSSRL,
             VSSRA,
             VNCLIPU,
+            VNCLIP,  // NOTE(review): vnclip.wi takes a zero-extended uimm5 shift amount in RVV 1.0, same as VNCLIPU - presumably why it belongs in this arm; confirm against the OPIVI branch
             VSLIDEUP_RGATHEREI16,
             VSLIDEDOWN,
             VRGATHER: begin
diff --git a/hdl/verilog/rvv/inc/rvv_backend_define.svh b/hdl/verilog/rvv/inc/rvv_backend_define.svh
index dea08ab..2801cdb 100755
--- a/hdl/verilog/rvv/inc/rvv_backend_define.svh
+++ b/hdl/verilog/rvv/inc/rvv_backend_define.svh
@@ -29,11 +29,11 @@
 // the depth of queue/station/buffer
 `define CQ_DEPTH                8
 `define UQ_DEPTH                16
-`define ALU_RS_DEPTH            2
+`define ALU_RS_DEPTH            4
 `define PMTRDT_RS_DEPTH         8
-`define MUL_RS_DEPTH            2
-`define DIV_RS_DEPTH            2
-`define LSU_RS_DEPTH            2
+`define MUL_RS_DEPTH            4
+`define DIV_RS_DEPTH            4
+`define LSU_RS_DEPTH            4
 `define ROB_DEPTH               8
 `define ROB_DEPTH_WIDTH         $clog2(`ROB_DEPTH)