Clean lint warnings: W416b, W362, W486, ...

Change-Id: Ie5350b57c334c856fee8815411ea03960649bfa5
diff --git a/hdl/verilog/rvv/common/multi_fifo.sv b/hdl/verilog/rvv/common/multi_fifo.sv
index a8be9ed..16deccf 100644
--- a/hdl/verilog/rvv/common/multi_fifo.sv
+++ b/hdl/verilog/rvv/common/multi_fifo.sv
@@ -121,7 +121,7 @@
   endgenerate
   // dataout
   always_comb begin
-    pop_count = pop[0];
+    pop_count = {(DEPTH_BITS)'(0), pop[0]};
     for (int j=1; j<N; j++) pop_count = pop_count + pop[j];
   end
   
@@ -146,7 +146,9 @@
           always_ff @(posedge clk) begin
             if ((i<remain_count)&(|pop))
               dataout[i] <= mem[current_rptr_mem[i]]; 
-            else if ((push_seq[current_rptr_psh[i]]&(current_rptr_psh[i]<M))&((|pop)|(|push_seq)))
+            else if ((push_seq[current_rptr_psh[i]]&(current_rptr_psh[i]<(DEPTH_BITS)'(M)))&
+                     ((|pop)|(|push_seq))
+                    )
               dataout[i] <= datain_seq[current_rptr_psh[i]];
           end
         end
@@ -155,7 +157,9 @@
           always_ff @(posedge clk) begin
             if ((i<remain_count)&(|pop))
               dataout[i] <= mem[current_rptr_mem[i]]; 
-            else if ((push[current_rptr_psh[i]]&(current_rptr_psh[i]<M))&((|pop)|(|push)))
+            else if ((push[current_rptr_psh[i]]&(current_rptr_psh[i]<(DEPTH_BITS)'(M)))&
+                     ((|pop)|(|push))
+                    )
               dataout[i] <= datain[current_rptr_psh[i]];
           end
         end
@@ -170,7 +174,7 @@
 
   // datain
   always_comb begin
-    push_count = push[0];
+    push_count = {(DEPTH_BITS)'(0), push[0]};
     for (int j=1; j<M; j++) push_count = push_count + push[j];
   end
 
diff --git a/hdl/verilog/rvv/design/rvv_backend_alu_unit_addsub.sv b/hdl/verilog/rvv/design/rvv_backend_alu_unit_addsub.sv
index 2d6aa0f..cc35a7e 100644
--- a/hdl/verilog/rvv/design/rvv_backend_alu_unit_addsub.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_alu_unit_addsub.sv
@@ -1674,9 +1674,9 @@
     logic                         cout;
 
     if (opcode==ADDSUB_VADD) 
-      {cout,result} = src_x + src_y + src_cin;
+      {cout,result} = (`BYTE_WIDTH+1)'(src_x) + (`BYTE_WIDTH+1)'(src_y) + (`BYTE_WIDTH+1)'(src_cin);
     else //(opcode==ADDSUB_VSUB)
-      {cout,result} = src_x - src_y - src_cin;
+      {cout,result} = (`BYTE_WIDTH+1)'(src_x) - (`BYTE_WIDTH+1)'(src_y) - (`BYTE_WIDTH+1)'(src_cin);
     
     return {cout,result};
 
@@ -1726,7 +1726,7 @@
     logic [`HWORD_WIDTH:0]   res_lo;
 
     res_hi = src_x[`WORD_WIDTH-1:`HWORD_WIDTH] + 1'b1;
-    res_lo = src_x[`HWORD_WIDTH-1:0] + 1'b1;
+    res_lo = (`HWORD_WIDTH+1)'(src_x[`HWORD_WIDTH-1:0]) + 1'b1;
     
     if (res_lo[`HWORD_WIDTH])
       return {res_hi,res_lo[`HWORD_WIDTH-1:0]};
diff --git a/hdl/verilog/rvv/design/rvv_backend_alu_unit_execution_p1.sv b/hdl/verilog/rvv/design/rvv_backend_alu_unit_execution_p1.sv
index 4251810..dbc75bc 100644
--- a/hdl/verilog/rvv/design/rvv_backend_alu_unit_execution_p1.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_alu_unit_execution_p1.sv
@@ -32,8 +32,8 @@
   generate
     if(`VLEN==128) begin
       for(j=0;j<64;j++) begin: GET_VIOTA128
-        assign result_data_viota[j] = alu_uop.data_viota_per64[0][j];
-        assign result_data_viota[j+64] = {1'b0,alu_uop.data_viota_per64[1][j]} + {1'b0,alu_uop.data_viota_per64[0][63]};
+        assign result_data_viota[j] = ($clog2(`VLEN)+1)'(alu_uop.data_viota_per64[0][j]);
+        assign result_data_viota[j+64] = ($clog2(`VLEN)+1)'({1'b0,alu_uop.data_viota_per64[1][j]} + {1'b0,alu_uop.data_viota_per64[0][63]});
       end
     end
   endgenerate
@@ -53,7 +53,7 @@
   endgenerate
   
   // vcpop
-  assign result_data_vcpop = result_data_viota[`VLEN-1];
+  assign result_data_vcpop = (`XLEN)'(result_data_viota[`VLEN-1]);
 
 //
 // submit result to ROB
@@ -72,7 +72,7 @@
     // calculate result data
     case(alu_uop.alu_sub_opcode)
       OP_VCPOP: begin
-        result.w_data = result_data_vcpop;
+        result.w_data = (`VLEN)'(result_data_vcpop);
         result.vsaturate = 'b0;
       end
       OP_VIOTA: begin
@@ -81,17 +81,17 @@
         case(alu_uop.vd_eew)
           EEW8: begin
             for(int i=0; i<`VLENB;i++) begin
-              result.w_data[i*`BYTE_WIDTH +: `BYTE_WIDTH] = result_data_viota8[i];
+              result.w_data[i*`BYTE_WIDTH +: `BYTE_WIDTH] = (`BYTE_WIDTH)'(result_data_viota8[i]);
             end
           end
           EEW16: begin
             for(int i=0; i<`VLEN/`HWORD_WIDTH;i++) begin
-              result.w_data[i*`HWORD_WIDTH +: `HWORD_WIDTH] = result_data_viota16[i];
+              result.w_data[i*`HWORD_WIDTH +: `HWORD_WIDTH] = (`HWORD_WIDTH)'(result_data_viota16[i]);
             end
           end
           EEW32: begin
             for(int i=0; i<`VLEN/`WORD_WIDTH;i++) begin
-              result.w_data[i*`WORD_WIDTH +: `WORD_WIDTH] = result_data_viota32[i];
+              result.w_data[i*`WORD_WIDTH +: `WORD_WIDTH] = (`WORD_WIDTH)'(result_data_viota32[i]);
             end
           end
         endcase
diff --git a/hdl/verilog/rvv/design/rvv_backend_alu_unit_mask.sv b/hdl/verilog/rvv/design/rvv_backend_alu_unit_mask.sv
index a6d88fb..256d6e3 100644
--- a/hdl/verilog/rvv/design/rvv_backend_alu_unit_mask.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_alu_unit_mask.sv
@@ -191,11 +191,11 @@
                 result_valid = alu_uop_valid&(vs1_data_valid==1'b0)&vs2_data_valid&((vm==1'b1)||((vm==1'b0)&v0_data_valid));
                 alu_sub_opcode = OP_VIOTA;
                 // it can get the viota result in one cycle whose element index in vd belongs to 0-31.
-                // Otherwise, it will get the result in next cycle.
+                // Otherwise, it will get the result in the next cycle.
                 case(vd_eew)
-                  EEW8   : result_2cycle = uop_index >= 32/(`VLEN/8);
-                  EEW16  : result_2cycle = uop_index >= 32/(`VLEN/16);
-                  default: result_2cycle = uop_index >= 32/(`VLEN/32);  //EEW32
+                  EEW8   : result_2cycle = uop_index >= (`UOP_INDEX_WIDTH)'(32/(`VLEN/8));
+                  EEW16  : result_2cycle = uop_index >= (`UOP_INDEX_WIDTH)'(32/(`VLEN/16));
+                  default: result_2cycle = uop_index >= (`UOP_INDEX_WIDTH)'(32/(`VLEN/32));  //EEW32
                 endcase
               end
               VID: begin
@@ -330,7 +330,7 @@
     else begin
       for(int i=0;i<`VLEN;i++) begin
         if (result_data_vmsof[i]==1'b1)
-          result_data_vfirst = i;         // one-hot to 8421BCD. get the index of first 1
+          result_data_vfirst = (`VLEN)'(i);         // one-hot to 8421BCD. get the index of first 1
       end
     end
   end
@@ -347,24 +347,24 @@
     end
 
     for(j=0; j<`VLENB;j++) begin: GET_VIOTA8
-      if ($clog2(32/`VLENB)<=3) // There may be up to 8 uops, so RHS in if-condition is $clog2(8)=3
-        assign result_data_viota8[j] = data_viota_per32[0][{alu_uop.uop_index[$clog2(32/`VLENB)-1:0],j[$clog2(`VLENB)-1:0]}];
+      if ((3)'($clog2(32/`VLENB)) <= 3'd3) // There may be up to 8 uops, so RHS in if-condition is $clog2(8)=3
+        assign result_data_viota8[j] = ($clog2(`VLEN)+1)'(data_viota_per32[0][{alu_uop.uop_index[$clog2(32/`VLENB)-1:0],j[$clog2(`VLENB)-1:0]}]);
       else
-        assign result_data_viota8[j] = data_viota_per32[0][{alu_uop.uop_index[2:0],j[$clog2(`VLENB)-1:0]}];
+        assign result_data_viota8[j] = ($clog2(`VLEN)+1)'(data_viota_per32[0][{alu_uop.uop_index[2:0],j[$clog2(`VLENB)-1:0]}]);
     end
 
     for(j=0; j<`VLEN/`HWORD_WIDTH;j++) begin: GET_VIOTA16
-      if ($clog2(32/(`VLEN/`HWORD_WIDTH))<=3)
-        assign result_data_viota16[j] = data_viota_per32[0][{alu_uop.uop_index[$clog2(32/(`VLEN/`HWORD_WIDTH))-1:0],j[$clog2(`VLEN/`HWORD_WIDTH)-1:0]}];
+      if ((3)'($clog2(32/(`VLEN/`HWORD_WIDTH))) <= 3'd3)
+        assign result_data_viota16[j] = ($clog2(`VLEN)+1)'(data_viota_per32[0][{alu_uop.uop_index[$clog2(32/(`VLEN/`HWORD_WIDTH))-1:0],j[$clog2(`VLEN/`HWORD_WIDTH)-1:0]}]);
       else
-        assign result_data_viota16[j] = data_viota_per32[0][{alu_uop.uop_index[2:0],j[$clog2(`VLEN/`HWORD_WIDTH)-1:0]}];
+        assign result_data_viota16[j] = ($clog2(`VLEN)+1)'(data_viota_per32[0][{alu_uop.uop_index[2:0],j[$clog2(`VLEN/`HWORD_WIDTH)-1:0]}]);
     end
 
     for(j=0; j<`VLEN/`WORD_WIDTH;j++) begin: GET_VIOTA32
-      if ($clog2(32/(`VLEN/`WORD_WIDTH))<=3)
-        assign result_data_viota32[j] = data_viota_per32[0][{alu_uop.uop_index[$clog2(32/(`VLEN/`WORD_WIDTH))-1:0],j[$clog2(`VLEN/`WORD_WIDTH)-1:0]}];
+      if ((3)'($clog2(32/(`VLEN/`WORD_WIDTH))) <= 3'd3)
+        assign result_data_viota32[j] = ($clog2(`VLEN)+1)'(data_viota_per32[0][{alu_uop.uop_index[$clog2(32/(`VLEN/`WORD_WIDTH))-1:0],j[$clog2(`VLEN/`WORD_WIDTH)-1:0]}]);
       else
-        assign result_data_viota32[j] = data_viota_per32[0][{alu_uop.uop_index[2:0],j[$clog2(`VLEN/`WORD_WIDTH)-1:0]}];
+        assign result_data_viota32[j] = ($clog2(`VLEN)+1)'(data_viota_per32[0][{alu_uop.uop_index[2:0],j[$clog2(`VLEN/`WORD_WIDTH)-1:0]}]);
     end
 
     for(j=0;j<`VLEN/64;j++) begin: GET_VIOTA_PER64_J
@@ -378,19 +378,19 @@
   // vid
   generate
     for(j=0;j<`VLENB;j++) begin: GET_VID8
-      assign result_data_vid8[j*`BYTE_WIDTH +: `BYTE_WIDTH] = {uop_index, j[$clog2(`VLENB)-1:0]};
+      assign result_data_vid8[j*`BYTE_WIDTH +: `BYTE_WIDTH] = (`BYTE_WIDTH)'({uop_index, j[$clog2(`VLENB)-1:0]});
     end
   endgenerate
 
   generate
     for(j=0;j<`VLEN/`HWORD_WIDTH;j++) begin: GET_VID16
-      assign result_data_vid16[j*`HWORD_WIDTH +: `HWORD_WIDTH] = {uop_index, j[$clog2(`VLEN/`HWORD_WIDTH)-1:0]};
+      assign result_data_vid16[j*`HWORD_WIDTH +: `HWORD_WIDTH] = (`HWORD_WIDTH)'({uop_index, j[$clog2(`VLEN/`HWORD_WIDTH)-1:0]});
     end
   endgenerate
 
   generate
     for(j=0;j<`VLEN/`WORD_WIDTH;j++) begin: GET_VID32
-      assign result_data_vid32[j*`WORD_WIDTH +: `WORD_WIDTH] = {uop_index, j[$clog2(`VLEN/`WORD_WIDTH)-1:0]};
+      assign result_data_vid32[j*`WORD_WIDTH +: `WORD_WIDTH] = (`WORD_WIDTH)'({uop_index, j[$clog2(`VLEN/`WORD_WIDTH)-1:0]});
     end
   endgenerate
 
@@ -464,17 +464,17 @@
                 case(vd_eew)
                   EEW8: begin
                     for(int i=0; i<`VLENB;i++) begin
-                      result_data[i*`BYTE_WIDTH +: `BYTE_WIDTH] = result_data_viota8[i];
+                      result_data[i*`BYTE_WIDTH +: `BYTE_WIDTH] = (`BYTE_WIDTH)'(result_data_viota8[i]);
                     end
                   end
                   EEW16: begin
                     for(int i=0; i<`VLEN/`HWORD_WIDTH;i++) begin
-                      result_data[i*`HWORD_WIDTH +: `HWORD_WIDTH] = result_data_viota16[i];
+                      result_data[i*`HWORD_WIDTH +: `HWORD_WIDTH] = (`HWORD_WIDTH)'(result_data_viota16[i]);
                     end
                   end
                   EEW32: begin
                     for(int i=0; i<`VLEN/`WORD_WIDTH;i++) begin
-                      result_data[i*`WORD_WIDTH +: `WORD_WIDTH] = result_data_viota32[i];
+                      result_data[i*`WORD_WIDTH +: `WORD_WIDTH] = (`WORD_WIDTH)'(result_data_viota32[i]);
                     end
                   end
                 endcase
@@ -502,7 +502,7 @@
 //
 // submit result to ROB
 //
-  assign vstart_onehot = 1'b1<<vstart;
+  assign vstart_onehot = (`VLEN)'('b1)<<vstart;
   assign vstart_onehot_sub1 = vstart_onehot - 1'b1;
 
   always_comb begin
diff --git a/hdl/verilog/rvv/design/rvv_backend_alu_unit_mask_viota.sv b/hdl/verilog/rvv/design/rvv_backend_alu_unit_mask_viota.sv
index 30a9876..3d3b5cd 100644
--- a/hdl/verilog/rvv/design/rvv_backend_alu_unit_mask_viota.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_alu_unit_mask_viota.sv
@@ -143,10 +143,11 @@
 
   generate
     for(j=0;j<7;j++) begin: GET_VIOTA32_27_0
-      assign result_viota32[j] = result_viota7[0][j];
-      assign result_viota32[j+7] = result_viota7[1][j]+result_viota7[0][6];
-      assign result_viota32[j+14] = sum_20to14[j]+{carry_20to14[j],1'b0};
-      assign result_viota32[j+21] = sum_27to21[j]+{({1'b0,carry_27to21[j]}+{1'b0,cout_27to21[j]}),1'b0};
+      assign result_viota32[j] = ($clog2(32)+1)'(result_viota7[0][j]);
+      assign result_viota32[j+7] = ($clog2(32)+1)'(result_viota7[1][j])+($clog2(32)+1)'(result_viota7[0][6]);
+      assign result_viota32[j+14] = ($clog2(32)+1)'(sum_20to14[j])+($clog2(32)+1)'({carry_20to14[j],1'b0});
+      assign result_viota32[j+21] = ($clog2(32)+1)'(sum_27to21[j])+($clog2(32)+1)'({({1'b0,carry_27to21[j]})+($clog2(32)+1)'({1'b0,cout_27to21[j]}),1'b0});
+
 
       compressor_3_2
       #(
@@ -179,7 +180,9 @@
     end
 
     for(j=0;j<4;j++) begin: GET_VIOTA32_31_28
-      assign result_viota32[j+28] = sum_31to28[j]+{({1'b0,carry_31to28[j]}+{1'b0,cout_31to28[j]}),1'b0};
+      assign result_viota32[j+28] = ($clog2(32)+1)'(sum_31to28[j])+
+                                    ($clog2(32)+1)'({({1'b0,carry_31to28[j]})+
+                                    ($clog2(32)+1)'({1'b0,cout_31to28[j]}),1'b0});
 
       compressor_4_2
       #(
diff --git a/hdl/verilog/rvv/design/rvv_backend_alu_unit_other.sv b/hdl/verilog/rvv/design/rvv_backend_alu_unit_other.sv
index a9fd507..b9d5498 100644
--- a/hdl/verilog/rvv/design/rvv_backend_alu_unit_other.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_alu_unit_other.sv
@@ -371,36 +371,36 @@
           VZEXT_VF2: begin
             case(vs2_eew)
               EEW8: begin
-                result_data_extend[(2*j  )*`HWORD_WIDTH +: `HWORD_WIDTH] = src2_data[(2*j  )*`BYTE_WIDTH +: `BYTE_WIDTH];
-                result_data_extend[(2*j+1)*`HWORD_WIDTH +: `HWORD_WIDTH] = src2_data[(2*j+1)*`BYTE_WIDTH +: `BYTE_WIDTH];
+                result_data_extend[(2*j  )*`HWORD_WIDTH +: `HWORD_WIDTH] = {8'b0, src2_data[(2*j  )*`BYTE_WIDTH +: `BYTE_WIDTH]};
+                result_data_extend[(2*j+1)*`HWORD_WIDTH +: `HWORD_WIDTH] = {8'b0, src2_data[(2*j+1)*`BYTE_WIDTH +: `BYTE_WIDTH]};
               end
               EEW16: begin
-                result_data_extend[j*`WORD_WIDTH +: `WORD_WIDTH] = src2_data[j*`HWORD_WIDTH +: `HWORD_WIDTH];
+                result_data_extend[j*`WORD_WIDTH +: `WORD_WIDTH] = {16'b0, src2_data[j*`HWORD_WIDTH +: `HWORD_WIDTH]};
               end
             endcase
           end
           VSEXT_VF2: begin
             case(vs2_eew)
               EEW8: begin
-                result_data_extend[(2*j  )*`HWORD_WIDTH +: `HWORD_WIDTH] = $signed(src2_data[(2*j  )*`BYTE_WIDTH +: `BYTE_WIDTH]);
-                result_data_extend[(2*j+1)*`HWORD_WIDTH +: `HWORD_WIDTH] = $signed(src2_data[(2*j+1)*`BYTE_WIDTH +: `BYTE_WIDTH]);
+                result_data_extend[(2*j  )*`HWORD_WIDTH +: `HWORD_WIDTH] = {{8{src2_data[(2*j+1)*`BYTE_WIDTH-1]}}, src2_data[(2*j  )*`BYTE_WIDTH +: `BYTE_WIDTH]};
+                result_data_extend[(2*j+1)*`HWORD_WIDTH +: `HWORD_WIDTH] = {{8{src2_data[(2*j+2)*`BYTE_WIDTH-1]}}, src2_data[(2*j+1)*`BYTE_WIDTH +: `BYTE_WIDTH]};
               end
               EEW16: begin
-                result_data_extend[j*`WORD_WIDTH +: `WORD_WIDTH] = $signed(src2_data[j*`HWORD_WIDTH +: `HWORD_WIDTH]);
+                result_data_extend[j*`WORD_WIDTH +: `WORD_WIDTH] = {{16{src2_data[(j+1)*`HWORD_WIDTH-1]}},src2_data[j*`HWORD_WIDTH +: `HWORD_WIDTH]};
               end
             endcase
           end
           VZEXT_VF4: begin
             case(vs2_eew)
               EEW8: begin
-                result_data_extend[j*`WORD_WIDTH +: `WORD_WIDTH] = src2_data[j*`BYTE_WIDTH +: `BYTE_WIDTH];
+                result_data_extend[j*`WORD_WIDTH +: `WORD_WIDTH] = {24'b0, src2_data[j*`BYTE_WIDTH +: `BYTE_WIDTH]};
               end
             endcase
           end
           VSEXT_VF4: begin       
             case(vs2_eew)
               EEW8: begin
-                result_data_extend[j*`WORD_WIDTH +: `WORD_WIDTH] = $signed(src2_data[j*`BYTE_WIDTH +: `BYTE_WIDTH]);
+                result_data_extend[j*`WORD_WIDTH +: `WORD_WIDTH] = {{24{src2_data[(j+1)*`BYTE_WIDTH-1]}}, src2_data[j*`BYTE_WIDTH +: `BYTE_WIDTH]};
               end
             endcase
           end
diff --git a/hdl/verilog/rvv/design/rvv_backend_alu_unit_shift.sv b/hdl/verilog/rvv/design/rvv_backend_alu_unit_shift.sv
index ed55d3e..b84b564 100755
--- a/hdl/verilog/rvv/design/rvv_backend_alu_unit_shift.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_alu_unit_shift.sv
@@ -168,11 +168,11 @@
                 end
                 for(int i=`VLENB/2;i<`VLENB*3/4;i=i+1) begin
                   src2_data16[   i-`VLENB/2] = {8'b0,vs2_data[i*`BYTE_WIDTH +: `BYTE_WIDTH]};
-                  shift_amount16[i-`VLENB/2] =       vs1_data[i*`BYTE_WIDTH +: $clog2(`BYTE_WIDTH)];
+                  shift_amount16[i-`VLENB/2] = {1'b0,vs1_data[i*`BYTE_WIDTH +: $clog2(`BYTE_WIDTH)]};
                 end         
                 for(int i=`VLENB*3/4;i<`VLENB;i=i+1) begin
                   src2_data32[   i-`VLENB*3/4] = {24'b0,vs2_data[i*`BYTE_WIDTH +: `BYTE_WIDTH]};
-                  shift_amount32[i-`VLENB*3/4] =        vs1_data[i*`BYTE_WIDTH +: $clog2(`BYTE_WIDTH)];
+                  shift_amount32[i-`VLENB*3/4] = {2'b0, vs1_data[i*`BYTE_WIDTH +: $clog2(`BYTE_WIDTH)]};
                 end
               end
               EEW16: begin
@@ -182,7 +182,7 @@
                 end
                 for(int i=`VLEN/`HWORD_WIDTH/2;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
                   src2_data32[   i-`VLEN/`HWORD_WIDTH/2] = {16'b0,vs2_data[i*`HWORD_WIDTH +: `HWORD_WIDTH]};
-                  shift_amount32[i-`VLEN/`HWORD_WIDTH/2] =        vs1_data[i*`HWORD_WIDTH +: $clog2(`HWORD_WIDTH)];
+                  shift_amount32[i-`VLEN/`HWORD_WIDTH/2] = {1'b0, vs1_data[i*`HWORD_WIDTH +: $clog2(`HWORD_WIDTH)]};
                 end   
               end
               EEW32: begin
@@ -204,11 +204,11 @@
                 end
                 for(int i=`VLENB/2;i<`VLENB*3/4;i=i+1) begin
                   src2_data16[   i-`VLENB/2] = {{8{vs2_data[(i+1)*`BYTE_WIDTH-1]}},vs2_data[i*`BYTE_WIDTH +: `BYTE_WIDTH]};
-                  shift_amount16[i-`VLENB/2] = vs1_data[i*`BYTE_WIDTH +: $clog2(`BYTE_WIDTH)];
+                  shift_amount16[i-`VLENB/2] = {1'b0,vs1_data[i*`BYTE_WIDTH +: $clog2(`BYTE_WIDTH)]};
                 end         
                 for(int i=`VLENB*3/4;i<`VLENB;i=i+1) begin
                   src2_data32[   i-`VLENB*3/4] = {{24{vs2_data[(i+1)*`BYTE_WIDTH-1]}},vs2_data[i*`BYTE_WIDTH +: `BYTE_WIDTH]};
-                  shift_amount32[i-`VLENB*3/4] = vs1_data[i*`BYTE_WIDTH +: $clog2(`BYTE_WIDTH)];
+                  shift_amount32[i-`VLENB*3/4] = {2'b0,vs1_data[i*`BYTE_WIDTH +: $clog2(`BYTE_WIDTH)]};
                 end
               end
               EEW16: begin
@@ -218,7 +218,7 @@
                 end
                 for(int i=`VLEN/`HWORD_WIDTH/2;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
                   src2_data32[   i-`VLEN/`HWORD_WIDTH/2] = {{16{vs2_data[(i+1)*`HWORD_WIDTH-1]}},vs2_data[i*`HWORD_WIDTH +: `HWORD_WIDTH]};
-                  shift_amount32[i-`VLEN/`HWORD_WIDTH/2] = vs1_data[i*`HWORD_WIDTH +: $clog2(`HWORD_WIDTH)];
+                  shift_amount32[i-`VLEN/`HWORD_WIDTH/2] = {1'b0,vs1_data[i*`HWORD_WIDTH +: $clog2(`HWORD_WIDTH)]};
                 end   
               end
               EEW32: begin
@@ -244,9 +244,9 @@
                 for(int i=`VLEN/`HWORD_WIDTH/2;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
                   src2_data32[   i-`VLEN/`HWORD_WIDTH/2] = {16'b0,vs2_data[i*`HWORD_WIDTH +: `HWORD_WIDTH]};
                   if (uop_index[0]==1'b0)
-                    shift_amount32[i-`VLEN/`HWORD_WIDTH/2] =      vs1_data[i*`BYTE_WIDTH  +: $clog2(`HWORD_WIDTH)];
+                    shift_amount32[i-`VLEN/`HWORD_WIDTH/2] = {1'b0, vs1_data[i*`BYTE_WIDTH  +: $clog2(`HWORD_WIDTH)]};
                   else
-                    shift_amount32[i-`VLEN/`HWORD_WIDTH/2] =      vs1_data[`VLEN/2+i*`BYTE_WIDTH  +: $clog2(`HWORD_WIDTH)];
+                    shift_amount32[i-`VLEN/`HWORD_WIDTH/2] = {1'b0, vs1_data[`VLEN/2+i*`BYTE_WIDTH  +: $clog2(`HWORD_WIDTH)]};
                 end   
               end
               EEW32: begin
@@ -275,9 +275,9 @@
                 for(int i=`VLEN/`HWORD_WIDTH/2;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
                   src2_data32[i-`VLEN/`HWORD_WIDTH/2] = {{16{vs2_data[(i+1)*`HWORD_WIDTH-1]}},vs2_data[i*`HWORD_WIDTH +: `HWORD_WIDTH]};
                   if (uop_index[0]==1'b0)
-                    shift_amount32[i-`VLEN/`HWORD_WIDTH/2] = vs1_data[i*`BYTE_WIDTH +: $clog2(`HWORD_WIDTH)];
+                    shift_amount32[i-`VLEN/`HWORD_WIDTH/2] = {1'b0, vs1_data[i*`BYTE_WIDTH +: $clog2(`HWORD_WIDTH)]};
                   else
-                    shift_amount32[i-`VLEN/`HWORD_WIDTH/2] = vs1_data[`VLEN/2+i*`BYTE_WIDTH +: $clog2(`HWORD_WIDTH)];
+                    shift_amount32[i-`VLEN/`HWORD_WIDTH/2] = {1'b0, vs1_data[`VLEN/2+i*`BYTE_WIDTH +: $clog2(`HWORD_WIDTH)]};
                 end   
               end
               EEW32: begin
@@ -308,11 +308,11 @@
                 end
                 for(int i=`VLENB/2;i<`VLENB*3/4;i=i+1) begin
                   src2_data16[   i-`VLENB/2] = {8'b0,vs2_data[i*`BYTE_WIDTH +: `BYTE_WIDTH]};
-                  shift_amount16[i-`VLENB/2] =       rs1_data[0             +: $clog2(`BYTE_WIDTH)];
+                  shift_amount16[i-`VLENB/2] = {1'b0,rs1_data[0             +: $clog2(`BYTE_WIDTH)]};
                 end         
                 for(int i=`VLENB*3/4;i<`VLENB;i=i+1) begin
                   src2_data32[   i-`VLENB*3/4] = {24'b0,vs2_data[i*`BYTE_WIDTH +: `BYTE_WIDTH]};
-                  shift_amount32[i-`VLENB*3/4] =        rs1_data[0             +: $clog2(`BYTE_WIDTH)];
+                  shift_amount32[i-`VLENB*3/4] = {2'b0, rs1_data[0             +: $clog2(`BYTE_WIDTH)]};
                 end
               end
               EEW16: begin
@@ -322,7 +322,7 @@
                 end
                 for(int i=`VLEN/`HWORD_WIDTH/2;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
                   src2_data32[   i-`VLEN/`HWORD_WIDTH/2] = {16'b0,vs2_data[i*`HWORD_WIDTH +: `HWORD_WIDTH]};
-                  shift_amount32[i-`VLEN/`HWORD_WIDTH/2] =        rs1_data[0              +: $clog2(`HWORD_WIDTH)];
+                  shift_amount32[i-`VLEN/`HWORD_WIDTH/2] = {1'b0, rs1_data[0              +: $clog2(`HWORD_WIDTH)]};
                 end   
               end
               EEW32: begin
@@ -344,11 +344,11 @@
                 end
                 for(int i=`VLENB/2;i<`VLENB*3/4;i=i+1) begin
                   src2_data16[   i-`VLENB/2] = {{8{vs2_data[(i+1)*`BYTE_WIDTH-1]}},vs2_data[i*`BYTE_WIDTH +: `BYTE_WIDTH]};
-                  shift_amount16[i-`VLENB/2] = rs1_data[0 +: $clog2(`BYTE_WIDTH)];
+                  shift_amount16[i-`VLENB/2] = {1'b0,rs1_data[0 +: $clog2(`BYTE_WIDTH)]};
                 end         
                 for(int i=`VLENB*3/4;i<`VLENB;i=i+1) begin
                   src2_data32[   i-`VLENB*3/4] = {{24{vs2_data[(i+1)*`BYTE_WIDTH-1]}},vs2_data[i*`BYTE_WIDTH +: `BYTE_WIDTH]};
-                  shift_amount32[i-`VLENB*3/4] = rs1_data[0 +: $clog2(`BYTE_WIDTH)];
+                  shift_amount32[i-`VLENB*3/4] = {2'b0,rs1_data[0 +: $clog2(`BYTE_WIDTH)]};
                 end
               end
               EEW16: begin
@@ -358,7 +358,7 @@
                 end
                 for(int i=`VLEN/`HWORD_WIDTH/2;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
                   src2_data32[   i-`VLEN/`HWORD_WIDTH/2] = {{16{vs2_data[(i+1)*`HWORD_WIDTH-1]}},vs2_data[i*`HWORD_WIDTH +: `HWORD_WIDTH]};
-                  shift_amount32[i-`VLEN/`HWORD_WIDTH/2] = rs1_data[0 +: $clog2(`HWORD_WIDTH)];
+                  shift_amount32[i-`VLEN/`HWORD_WIDTH/2] = {1'b0,rs1_data[0 +: $clog2(`HWORD_WIDTH)]};
                 end   
               end
               EEW32: begin
@@ -380,7 +380,7 @@
                 end
                 for(int i=`VLEN/`HWORD_WIDTH/2;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
                   src2_data32[   i-`VLEN/`HWORD_WIDTH/2] = {16'b0,vs2_data[i*`HWORD_WIDTH +: `HWORD_WIDTH]};
-                  shift_amount32[i-`VLEN/`HWORD_WIDTH/2] =        rs1_data[0              +: $clog2(`HWORD_WIDTH)];
+                  shift_amount32[i-`VLEN/`HWORD_WIDTH/2] = {1'b0, rs1_data[0              +: $clog2(`HWORD_WIDTH)]};
                 end   
               end
               EEW32: begin
@@ -402,7 +402,7 @@
                 end
                 for(int i=`VLEN/`HWORD_WIDTH/2;i<`VLEN/`HWORD_WIDTH;i=i+1) begin
                   src2_data32[   i-`VLEN/`HWORD_WIDTH/2] = {{16{vs2_data[(i+1)*`HWORD_WIDTH-1]}},vs2_data[i*`HWORD_WIDTH +: `HWORD_WIDTH]};
-                  shift_amount32[i-`VLEN/`HWORD_WIDTH/2] = rs1_data[0 +: $clog2(`HWORD_WIDTH)];
+                  shift_amount32[i-`VLEN/`HWORD_WIDTH/2] = {1'b0,rs1_data[0 +: $clog2(`HWORD_WIDTH)]};
                 end   
               end
               EEW32: begin
diff --git a/hdl/verilog/rvv/design/rvv_backend_decode_unit_ari.sv b/hdl/verilog/rvv/design/rvv_backend_decode_unit_ari.sv
index ee86f5f..61b3e2c 100644
--- a/hdl/verilog/rvv/design/rvv_backend_decode_unit_ari.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_decode_unit_ari.sv
@@ -3616,17 +3616,14 @@
     uop_index_max = 'b0;
     
     case(emul_max)
-      EMUL1: begin
-        uop_index_max = 'b0;
-      end
       EMUL2: begin
-        uop_index_max = 'd1;
+        uop_index_max = (`UOP_INDEX_WIDTH)'('d1);
       end
       EMUL4: begin
-        uop_index_max = 'd3;
+        uop_index_max = (`UOP_INDEX_WIDTH)'('d3);
       end
       EMUL8: begin
-        uop_index_max = 'd7;
+        uop_index_max = (`UOP_INDEX_WIDTH)'('d7);
       end
     endcase
   end
diff --git a/hdl/verilog/rvv/design/rvv_backend_decode_unit_lsu.sv b/hdl/verilog/rvv/design/rvv_backend_decode_unit_lsu.sv
index e4af58f..a940fe2 100644
--- a/hdl/verilog/rvv/design/rvv_backend_decode_unit_lsu.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_decode_unit_lsu.sv
@@ -3049,7 +3049,7 @@
   // calculate the uop_index used in decoding uops 
   generate
     for(j=0;j<`NUM_DE_UOP;j++) begin: GET_UOP_INDEX
-      assign uop_index_current[j] = j[`UOP_INDEX_WIDTH-1:0]+uop_index_base;
+      assign uop_index_current[j] = (`UOP_INDEX_WIDTH+1)'(j[`UOP_INDEX_WIDTH-1:0]+uop_index_base);
     end
   endgenerate
 
@@ -3061,29 +3061,26 @@
     uop_index_max = 'b0;
     
     case(emul_max)
-      EMUL1: begin
-        uop_index_max = 'd0;
-      end
       EMUL2: begin
-        uop_index_max = 'd1;
+        uop_index_max = (`UOP_INDEX_WIDTH)'('d1);
       end
       EMUL3: begin
-        uop_index_max = 'd2;
+        uop_index_max = (`UOP_INDEX_WIDTH)'('d2);
       end
       EMUL4: begin
-        uop_index_max = 'd3;
+        uop_index_max = (`UOP_INDEX_WIDTH)'('d3);
       end
       EMUL5: begin
-        uop_index_max = 'd4;
+        uop_index_max = (`UOP_INDEX_WIDTH)'('d4);
       end
       EMUL6: begin
-        uop_index_max = 'd5;
+        uop_index_max = (`UOP_INDEX_WIDTH)'('d5);
       end
       EMUL7: begin
-        uop_index_max = 'd6;
+        uop_index_max = (`UOP_INDEX_WIDTH)'('d6);
       end
       EMUL8: begin
-        uop_index_max = 'd7;
+        uop_index_max = (`UOP_INDEX_WIDTH)'('d7);
       end
     endcase
   end
diff --git a/hdl/verilog/rvv/design/rvv_backend_dispatch.sv b/hdl/verilog/rvv/design/rvv_backend_dispatch.sv
index ed76dc5..e7ef053 100755
--- a/hdl/verilog/rvv/design/rvv_backend_dispatch.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_dispatch.sv
@@ -150,7 +150,7 @@
     generate

       for (i=0; i<`NUM_DP_UOP; i++) begin : gen_vlmax

         assign vlmax_shift[i] ={1'b0, uop_uop2dp[i].vector_csr.lmul[1:0]} + $clog2(`VLENB) - uop_uop2dp[i].vector_csr.sew - {uop_uop2dp[i].vector_csr.lmul[2],2'b00};

-        assign vlmax[i] = 'h1 << vlmax_shift[i];

+        assign vlmax[i] = (`VL_WIDTH)'(1) << vlmax_shift[i];

       end

     endgenerate

 

diff --git a/hdl/verilog/rvv/design/rvv_backend_dispatch_opr_byte_type.sv b/hdl/verilog/rvv/design/rvv_backend_dispatch_opr_byte_type.sv
index 770243a..2db3a49 100644
--- a/hdl/verilog/rvv/design/rvv_backend_dispatch_opr_byte_type.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_dispatch_opr_byte_type.sv
@@ -86,7 +86,7 @@
         always_comb begin
           case (uop_info.uop_exe_unit)
             RDT:begin
-              uop_vs2_start = uop_info.uop_index << (VLENB_WIDTH - vs2_eew_shift);
+              uop_vs2_start = (`VSTART_WIDTH)'(uop_info.uop_index) << (VLENB_WIDTH - vs2_eew_shift);
             end
             default:begin
               case({eew_max,uop_info.vs2_eew})
@@ -94,16 +94,16 @@
                 {EEW16,EEW16},
                 {EEW8,EEW8}: begin
                   // regular and narrowing instruction
-                  uop_vs2_start = uop_info.uop_index << (VLENB_WIDTH - vs2_eew_shift);
+                  uop_vs2_start = (`VSTART_WIDTH)'(uop_info.uop_index) << (VLENB_WIDTH - vs2_eew_shift);
                 end
                 {EEW32,EEW16},
                 {EEW16,EEW8}: begin
                   // widening instruction: EEW_vd:EEW_vs = 2:1
-                  uop_vs2_start = uop_info.uop_index[`UOP_INDEX_WIDTH-1:1] << (VLENB_WIDTH - vs2_eew_shift);
+                  uop_vs2_start = (`VSTART_WIDTH)'(uop_info.uop_index[`UOP_INDEX_WIDTH-1:1]) << (VLENB_WIDTH - vs2_eew_shift);
                 end
                 {EEW32,EEW8}: begin
                   // widening instruction: EEW_vd:EEW_vs = 4:1
-                  uop_vs2_start = uop_info.uop_index[`UOP_INDEX_WIDTH-1:2] << (VLENB_WIDTH - vs2_eew_shift);
+                  uop_vs2_start = (`VSTART_WIDTH)'(uop_info.uop_index[`UOP_INDEX_WIDTH-1:2]) << (VLENB_WIDTH - vs2_eew_shift);
                 end
                 default: begin
                   uop_vs2_start = 'b0;
@@ -118,7 +118,7 @@
         for (i=0; i<`VLENB; i++) begin : gen_vs2_byte_type
             // ele_index = uop_index * (VLEN/vs2_eew) + BYTE_INDEX[MSB:vs2_eew]
             assign vs2_enable[i] = uop_info.vm ? 1'b1 : vs2_enable_tmp[i >> vs2_eew_shift];
-            assign vs2_ele_index[i] = uop_vs2_start + (i >> vs2_eew_shift);
+            assign vs2_ele_index[i] = (`VL_WIDTH)'(uop_vs2_start) + (i >> vs2_eew_shift);
             always_comb begin
                 if (uop_info.ignore_vta&uop_info.ignore_vma)
                     vs2[i] = BODY_ACTIVE;       
@@ -127,8 +127,7 @@
                 else if (vs2_ele_index[i] < {1'b0, uop_info.vstart}) 
                     vs2[i] = NOT_CHANGE; // prestart
                 else begin 
-                    vs2[i] = (vs2_enable[i] || uop_info.ignore_vma) ? BODY_ACTIVE
-                                                                                     : BODY_INACTIVE;
+                    vs2[i] = (vs2_enable[i] || uop_info.ignore_vma) ? BODY_ACTIVE : BODY_INACTIVE;
                 end
             end
         end
@@ -150,21 +149,21 @@
             {EEW32,EEW32},
             {EEW16,EEW16},
             {EEW8,EEW8}: begin
-              uop_v0_start = uop_info.uop_index << (VLENB_WIDTH - vd_eew_shift);
+              uop_v0_start = (`VSTART_WIDTH)'(uop_info.uop_index) << (VLENB_WIDTH - vd_eew_shift);
               uop_vd_start = uop_v0_start; 
               uop_vd_end = uop_vd_start + (`VLENB >> eew_max_shift) - 1'b1;
             end
             {EEW32,EEW16},
             {EEW16,EEW8}: begin
               // narrowing instruction: EEW_vd:EEW_vs = 1:2
-              uop_v0_start = uop_info.uop_index[`UOP_INDEX_WIDTH-1:1] << (VLENB_WIDTH - vd_eew_shift);
+              uop_v0_start = (`VSTART_WIDTH)'(uop_info.uop_index[`UOP_INDEX_WIDTH-1:1]) << (VLENB_WIDTH - vd_eew_shift);
               uop_vd_start = uop_info.uop_index[0] ? uop_v0_start + (`VLENB >> eew_max_shift):
                                                      uop_v0_start;
               uop_vd_end = uop_vd_start + (`VLENB >> eew_max_shift) - 1'b1 ; 
             end
             {EEW32,EEW8}: begin
               // narrowing instruction: EEW_vd:EEW_vs = 1:4
-              uop_v0_start = uop_info.uop_index[`UOP_INDEX_WIDTH-1:2] << VLENB_WIDTH;
+              uop_v0_start = (`VSTART_WIDTH)'(uop_info.uop_index[`UOP_INDEX_WIDTH-1:2]) << VLENB_WIDTH;
               case(uop_info.uop_index[1:0])
                 2'd3: begin
                   uop_vd_start = uop_v0_start + `VLENB*3/4;
@@ -195,7 +194,7 @@
           if (i==0) begin
             // ele_index = uop_index * (VLEN/vd_eew) + BYTE_INDEX[MSB:vd_eew]
             assign vd_enable[0] = uop_info.vm ? 1'b1 : vd_enable_tmp[0];
-            assign vd_ele_index[0] = uop_v0_start;
+            assign vd_ele_index[0] = (`VL_WIDTH)'(uop_v0_start);
 
             always_comb begin
               v0_strobe[0] = 'b0;
@@ -228,7 +227,7 @@
           end else begin
             // ele_index = uop_index * (VLEN/vd_eew) + BYTE_INDEX[MSB:vd_eew]
             assign vd_enable[i] = uop_info.vm ? 1'b1 : vd_enable_tmp[i >> vd_eew_shift];
-            assign vd_ele_index[i] = uop_v0_start + (i >> vd_eew_shift);
+            assign vd_ele_index[i] = (`VL_WIDTH)'(uop_v0_start) + (i >> vd_eew_shift);
 
             always_comb begin
               v0_strobe[i] = 'b0;
diff --git a/hdl/verilog/rvv/design/rvv_backend_div_unit.sv b/hdl/verilog/rvv/design/rvv_backend_div_unit.sv
index 77a66bd..d60697e 100755
--- a/hdl/verilog/rvv/design/rvv_backend_div_unit.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_div_unit.sv
@@ -319,7 +319,7 @@
     for(j=0;j<`VLENB/2;j++) begin: DIVIDER8
       rvv_backend_div_unit_divider
       #(
-        .DIV_WIDTH          (`BYTE_WIDTH)
+        .DIV_WIDTH          (8'd`BYTE_WIDTH)
       )
       divider_8bit
       (
@@ -342,7 +342,7 @@
     for(j=0;j<`VLEN/`HWORD_WIDTH/2;j++) begin: DIVIDER16
       rvv_backend_div_unit_divider
       #(
-        .DIV_WIDTH          (`HWORD_WIDTH)
+        .DIV_WIDTH          (8'd`HWORD_WIDTH)
       )
       divider_16bit
       (
@@ -365,7 +365,7 @@
     for(j=0;j<`VLEN/`WORD_WIDTH;j++) begin: DIVIDER32
       rvv_backend_div_unit_divider
       #(
-        .DIV_WIDTH          (`WORD_WIDTH)
+        .DIV_WIDTH          (8'd`WORD_WIDTH)
       )
       divider_32bit
       (
diff --git a/hdl/verilog/rvv/design/rvv_backend_div_unit_divider.sv b/hdl/verilog/rvv/design/rvv_backend_div_unit_divider.sv
index a25a32a..336feba 100755
--- a/hdl/verilog/rvv/design/rvv_backend_div_unit_divider.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_div_unit_divider.sv
@@ -20,16 +20,13 @@
   result_quotient,
   result_remainder,
   result_valid,
-`ifdef TB_SUPPORT
-  res_reuse_valid_p1,
-`endif
   result_ready,
   trap_flush_rvv
 );
 //
 // parameter
 //
-  parameter DIV_WIDTH = `WORD_WIDTH;
+  parameter logic[7:0] DIV_WIDTH = `WORD_WIDTH;
 
 //
 // interface signals
@@ -50,9 +47,6 @@
   output  logic [DIV_WIDTH-1:0] result_quotient;
   output  logic [DIV_WIDTH-1:0] result_remainder;
   output  logic                 result_valid;
-`ifdef TB_SUPPORT
-  output  logic                 res_reuse_valid_p1;
-`endif
   input   logic                 result_ready;
 
   // trap-flush
@@ -202,19 +196,6 @@
     .q      (r_sgn_q)
   );
 
-`ifdef TB_SUPPORT
-  always_ff @(posedge clk, negedge rst_n) begin
-    if(rst_n=='b0)
-      res_reuse_valid_p1 = 'b0;
-    else if(next_state==DIV_IDLE)
-      res_reuse_valid_p1 = 'b0;
-    else if((state==DIV_IDLE)&div_valid)
-      res_reuse_valid_p1 = res_reuse_valid_p0;
-    else
-      res_reuse_valid_p1 = res_reuse_valid_p1;
-  end
-`endif
-
 //
 // FSM
 //
@@ -266,17 +247,17 @@
     endcase
   end
 
-  // computational logic in every state
+  // count the leading zeros of the dividend and derive the shift count from them
   generate 
-    if (DIV_WIDTH==`WORD_WIDTH) begin
+    if (DIV_WIDTH== 'd`WORD_WIDTH) begin
       assign clzb = f_clzb32(dividend_d);
       assign count_shift = 'd33 - clzb;            
     end
-    else if (DIV_WIDTH==`HWORD_WIDTH) begin
+    else if (DIV_WIDTH== 'd`HWORD_WIDTH) begin
       assign clzb = f_clzb16(dividend_d);
       assign count_shift = 'd17 - clzb;            
     end
-    else if (DIV_WIDTH==`BYTE_WIDTH) begin
+    else if (DIV_WIDTH== 'd`BYTE_WIDTH) begin
       assign clzb = f_clzb8(dividend_d);
       assign count_shift = 'd9 - clzb;            
     end
diff --git a/hdl/verilog/rvv/design/rvv_backend_mac_unit.sv b/hdl/verilog/rvv/design/rvv_backend_mac_unit.sv
index d356171..6ae4f74 100644
--- a/hdl/verilog/rvv/design/rvv_backend_mac_unit.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_mac_unit.sv
@@ -726,12 +726,12 @@
                                                                                       mac_rslt_full_eew8_d1[i*4+j][7+:8] + {7'b0,vsmul_round_incr_eew8_d1[i*4+j]};//right shift 7bit then +"1"
         vsmul_sat_eew8_d1[i*4+j] = mac_rslt_full_eew8_d1[i*4+j][15:14] == 2'b01;
         //Below are for vmac related instructions
-        vmac_mul_add_eew8_no_widen_d1[i*4+j] = mac_addsrc_d1[8*(i*4+j) +: 8] + mac_rslt_eew8_no_widen_d1[8*(i*4+j) +: 8];//9bit
-        vmac_mul_sub_eew8_no_widen_d1[i*4+j] = mac_addsrc_d1[8*(i*4+j) +: 8] - mac_rslt_eew8_no_widen_d1[8*(i*4+j) +: 8];
+        vmac_mul_add_eew8_no_widen_d1[i*4+j] = {1'b0,mac_addsrc_d1[8*(i*4+j) +: 8]} + {1'b0,mac_rslt_eew8_no_widen_d1[8*(i*4+j) +: 8]};//9bit
+        vmac_mul_sub_eew8_no_widen_d1[i*4+j] = {1'b0,mac_addsrc_d1[8*(i*4+j) +: 8]} - {1'b0,mac_rslt_eew8_no_widen_d1[8*(i*4+j) +: 8]};
         vmac_rslt_eew8_no_widen_d1[8*(i*4+j) +:8] = mac_mul_reverse_d1 ? vmac_mul_sub_eew8_no_widen_d1[i*4+j][7:0] :
                                                                          vmac_mul_add_eew8_no_widen_d1[i*4+j][7:0];
-        vmac_mul_add_eew8_widen_d1[i*4+j] = mac_addsrc_widen_d1[16*(i*4+j) +: 16] + mac_rslt_eew8_widen_d1[16*(i*4+j) +: 16];//17bit
-        vmac_mul_sub_eew8_widen_d1[i*4+j] = mac_addsrc_widen_d1[16*(i*4+j) +: 16] - mac_rslt_eew8_widen_d1[16*(i*4+j) +: 16];
+        vmac_mul_add_eew8_widen_d1[i*4+j] = {1'b0,mac_addsrc_widen_d1[16*(i*4+j) +: 16]} + {1'b0,mac_rslt_eew8_widen_d1[16*(i*4+j) +: 16]};//17bit
+        vmac_mul_sub_eew8_widen_d1[i*4+j] = {1'b0,mac_addsrc_widen_d1[16*(i*4+j) +: 16]} - {1'b0,mac_rslt_eew8_widen_d1[16*(i*4+j) +: 16]};
         vmac_rslt_eew8_widen_d1[16*(i*4+j) +: 16] = mac_mul_reverse_d1 ? vmac_mul_sub_eew8_widen_d1[i*4+j][15:0] :
                                                                         vmac_mul_add_eew8_widen_d1[i*4+j][15:0];
     end
@@ -765,12 +765,12 @@
                                                                                        mac_rslt_full_eew16_d1[i*2+j][15+:16] + {15'b0,vsmul_round_incr_eew16_d1[i*2+j]};//right shift 15bit then +"1"
       vsmul_sat_eew16_d1[i*2+j] = mac_rslt_full_eew16_d1[i*2+j][31:30] == 2'b01;
       //Below are for vmac related instructions
-      vmac_mul_add_eew16_no_widen_d1[i*2+j] = mac_addsrc_d1[16*(i*2+j) +: 16] + mac_rslt_eew16_no_widen_d1[16*(i*2+j) +: 16];//17bit
-      vmac_mul_sub_eew16_no_widen_d1[i*2+j] = mac_addsrc_d1[16*(i*2+j) +: 16] - mac_rslt_eew16_no_widen_d1[16*(i*2+j) +: 16];
+      vmac_mul_add_eew16_no_widen_d1[i*2+j] = {1'b0,mac_addsrc_d1[16*(i*2+j) +: 16]} + {1'b0,mac_rslt_eew16_no_widen_d1[16*(i*2+j) +: 16]};//17bit
+      vmac_mul_sub_eew16_no_widen_d1[i*2+j] = {1'b0,mac_addsrc_d1[16*(i*2+j) +: 16]} - {1'b0,mac_rslt_eew16_no_widen_d1[16*(i*2+j) +: 16]};
       vmac_rslt_eew16_no_widen_d1[16*(i*2+j) +:16] = mac_mul_reverse_d1 ? vmac_mul_sub_eew16_no_widen_d1[i*2+j][15:0] :
                                                                           vmac_mul_add_eew16_no_widen_d1[i*2+j][15:0];
-      vmac_mul_add_eew16_widen_d1[i*2+j] = mac_addsrc_widen_d1[32*(i*2+j) +: 32] + mac_rslt_eew16_widen_d1[32*(i*2+j) +: 32];//33bit
-      vmac_mul_sub_eew16_widen_d1[i*2+j] = mac_addsrc_widen_d1[32*(i*2+j) +: 32] - mac_rslt_eew16_widen_d1[32*(i*2+j) +: 32];
+      vmac_mul_add_eew16_widen_d1[i*2+j] = {1'b0,mac_addsrc_widen_d1[32*(i*2+j) +: 32]} + {1'b0,mac_rslt_eew16_widen_d1[32*(i*2+j) +: 32]};//33bit
+      vmac_mul_sub_eew16_widen_d1[i*2+j] = {1'b0,mac_addsrc_widen_d1[32*(i*2+j) +: 32]} - {1'b0,mac_rslt_eew16_widen_d1[32*(i*2+j) +: 32]};
       vmac_rslt_eew16_widen_d1[32*(i*2+j) +: 32] = mac_mul_reverse_d1 ? vmac_mul_sub_eew16_widen_d1[i*2+j][31:0] :
                                                                        vmac_mul_add_eew16_widen_d1[i*2+j][31:0];
     end
@@ -822,12 +822,12 @@
                                                                              mac_rslt_full_eew32_d1[i][31+:32] + {31'b0,vsmul_round_incr_eew32_d1[i]};//right shift 31bit then +"1"
     vsmul_sat_eew32_d1[i] = mac_rslt_full_eew32_d1[i][63:62] == 2'b01;
     //Below are for vmac related instructions
-    vmac_mul_add_eew32_no_widen_d1[i] = mac_addsrc_d1[32*i +: 32] + mac_rslt_eew32_no_widen_d1[32*i +: 32];//33bit
-    vmac_mul_sub_eew32_no_widen_d1[i] = mac_addsrc_d1[32*i +: 32] - mac_rslt_eew32_no_widen_d1[32*i +: 32];
+    vmac_mul_add_eew32_no_widen_d1[i] = {1'b0,mac_addsrc_d1[32*i +: 32]} + {1'b0,mac_rslt_eew32_no_widen_d1[32*i +: 32]};//33bit
+    vmac_mul_sub_eew32_no_widen_d1[i] = {1'b0,mac_addsrc_d1[32*i +: 32]} - {1'b0,mac_rslt_eew32_no_widen_d1[32*i +: 32]};
     vmac_rslt_eew32_no_widen_d1[32*i +:32] = mac_mul_reverse_d1 ? vmac_mul_sub_eew32_no_widen_d1[i][31:0] :
                                                                   vmac_mul_add_eew32_no_widen_d1[i][31:0];
-    vmac_mul_add_eew32_widen_d1[i] = mac_addsrc_widen_d1[64*i +: 64] + mac_rslt_eew32_widen_d1[64*i +: 64];//65bit
-    vmac_mul_sub_eew32_widen_d1[i] = mac_addsrc_widen_d1[64*i +: 64] - mac_rslt_eew32_widen_d1[64*i +: 64];
+    vmac_mul_add_eew32_widen_d1[i] = {1'b0,mac_addsrc_widen_d1[64*i +: 64]} + {1'b0,mac_rslt_eew32_widen_d1[64*i +: 64]};//65bit
+    vmac_mul_sub_eew32_widen_d1[i] = {1'b0,mac_addsrc_widen_d1[64*i +: 64]} - {1'b0,mac_rslt_eew32_widen_d1[64*i +: 64]};
     vmac_rslt_eew32_widen_d1[64*i +: 64] = mac_mul_reverse_d1 ? vmac_mul_sub_eew32_widen_d1[i][63:0] :
                                                                vmac_mul_add_eew32_widen_d1[i][63:0];
   end
diff --git a/hdl/verilog/rvv/design/rvv_backend_mul_unit_mul8.sv b/hdl/verilog/rvv/design/rvv_backend_mul_unit_mul8.sv
index c291c85..360cfce 100644
--- a/hdl/verilog/rvv/design/rvv_backend_mul_unit_mul8.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_mul_unit_mul8.sv
@@ -24,7 +24,7 @@
 wire [8:0] in0_int = {in0_is_signed&in0[7],in0};
 wire [8:0] in1_int = {in1_is_signed&in1[7],in1};
 
-wire [17:0] out_int = $signed(in0_int)*$signed(in1_int);
+wire [17:0] out_int = {{9{in0_int[8]}},in0_int} * {{9{in1_int[8]}},in1_int}; // both 9-bit operands are sign-extended to 18 bits (bit 8 replicated) so the multiply is evaluated at the full 18-bit result width
 
 assign out = out_int[0+:16];
 
diff --git a/hdl/verilog/rvv/design/rvv_backend_pmtrdt_unit.sv b/hdl/verilog/rvv/design/rvv_backend_pmtrdt_unit.sv
index 862e469..09417e4 100644
--- a/hdl/verilog/rvv/design/rvv_backend_pmtrdt_unit.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_pmtrdt_unit.sv
@@ -138,6 +138,7 @@
   logic                         pmt_go, pmt_go_q; // start to execute pmt inst when all uop(s) are in RS
   logic [`UOP_INDEX_WIDTH-1:0]  pmt_uop_done_cnt_d, pmt_uop_done_cnt_q;
   logic [`VLENB-1:0][`XLEN+1:0] offset;
+  logic [`VLENB-1:0][`XLEN+1:0] slide_down_offset; // per-byte offset[i] minus pmtrdt_uop.uop_index*`VLENB
   logic [`VLENB-1:0]            sel_scalar;
   BYTE_TYPE_t                   vd_type;
   logic [`VLMAX_MAX-1:0][7:0]   pmt_vs2_data, pmt_vs3_data;
@@ -2244,12 +2245,12 @@
       // cmp_res_d/cmp_res_q
       always_comb begin
         case (pmtrdt_uop.vs2_eew)
-          EEW32: cmp_res_en = {'0, 1'b1} << cmp_res_en_offset;
-          EEW16: cmp_res_en = {'0, 2'b11} << cmp_res_en_offset;
-          default: cmp_res_en = {'0, 4'b1111} << cmp_res_en_offset;
+          EEW32: cmp_res_en = (2*`VLENB)'('b1)  << cmp_res_en_offset;
+          EEW16: cmp_res_en = (2*`VLENB)'('b11) << cmp_res_en_offset;
+          default: cmp_res_en = (2*`VLENB)'('b1111) << cmp_res_en_offset;
         endcase
       end
-      assign cmp_res_d = {'0, cmp_res} << cmp_res_offset;
+      assign cmp_res_d = (`VLEN)'(cmp_res) << cmp_res_offset;
       for (i=0; i<(2*`VLENB); i++) begin
         edff #(.T(logic[`VLEN/32-1:0])) cmp_res_reg (.q(cmp_res_q[`VLEN/32*i+:`VLEN/32]), .d(cmp_res_d[`VLEN/32*i+:`VLEN/32]), .e(cmp_res_en[i] & pmtrdt_uop_valid & pmtrdt_uop_ready), .clk(clk), .rst_n(rst_n));
       end
@@ -2302,29 +2303,29 @@
             SLIDE_UP:begin
               if (pmtrdt_uop.uop_funct3 == OPMVX)
                 case (pmtrdt_uop.vs2_eew) // Permutation instruction: vd_eew == vs2_eew
-                  EEW32:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+i-4;
-                  EEW16:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+i-2;
-                  default:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+i-1; 
+                  EEW32:offset[i] = (`XLEN+2)'(uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+i-4);
+                  EEW16:offset[i] = (`XLEN+2)'(uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+i-2);
+                  default:offset[i] = (`XLEN+2)'(uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+i-1);
                 endcase
               else
                 case (pmtrdt_uop.vs2_eew) // Permutation instruction: vd_eew == vs2_eew
-                  EEW32:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB + i - (4*pmtrdt_uop.rs1_data);
-                  EEW16:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB + i - (2*pmtrdt_uop.rs1_data);
-                  default:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+ i - pmtrdt_uop.rs1_data; 
+                  EEW32:offset[i] = (`XLEN+2)'(uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB + i - (4*pmtrdt_uop.rs1_data));
+                  EEW16:offset[i] = (`XLEN+2)'(uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB + i - (2*pmtrdt_uop.rs1_data));
+                  default:offset[i] = (`XLEN+2)'(uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+ i - pmtrdt_uop.rs1_data);
                 endcase
             end
             SLIDE_DOWN:begin
               if (pmtrdt_uop.uop_funct3 == OPMVX)
                 case (pmtrdt_uop.vs2_eew) // Permutation instruction: vd_eew == vs2_eew
-                  EEW32:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+i+4;
-                  EEW16:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+i+2;
-                  default:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+i+1;
+                  EEW32:offset[i] = (`XLEN+2)'(uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+i+4);
+                  EEW16:offset[i] = (`XLEN+2)'(uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+i+2);
+                  default:offset[i] = (`XLEN+2)'(uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB+i+1);
                 endcase
               else
                 case (pmtrdt_uop.vs2_eew) // Permutation instruction: vd_eew == vs2_eew
-                  EEW32:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB + i + (4*pmtrdt_uop.rs1_data);
-                  EEW16:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB + i + (2*pmtrdt_uop.rs1_data);
-                  default:offset[i] = uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB + i + pmtrdt_uop.rs1_data;
+                  EEW32:offset[i] = (`XLEN+2)'(uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB + i + (4*pmtrdt_uop.rs1_data));
+                  EEW16:offset[i] = (`XLEN+2)'(uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB + i + (2*pmtrdt_uop.rs1_data));
+                  default:offset[i] = (`XLEN+2)'(uop_data[pmt_uop_done_cnt_q].uop_index*`VLENB + i + pmtrdt_uop.rs1_data);
                 endcase
             end
             GATHER:begin
@@ -2332,27 +2333,27 @@
                 OPIVX,
                 OPIVI:begin
                   case (pmtrdt_uop.vs2_eew) // Permutation instruction: vd_eew == vs2_eew
-                    EEW32:offset[i] = i%4 + {pmtrdt_uop.rs1_data,2'b0};
-                    EEW16:offset[i] = i%2 + {pmtrdt_uop.rs1_data,1'b0};
-                    default:offset[i] = pmtrdt_uop.rs1_data;
+                    EEW32:offset[i] = (`XLEN+2)'(i%4 + {pmtrdt_uop.rs1_data,2'b0});
+                    EEW16:offset[i] = (`XLEN+2)'(i%2 + {pmtrdt_uop.rs1_data,1'b0});
+                    default:offset[i] = (`XLEN+2)'(pmtrdt_uop.rs1_data);
                   endcase
                 end
                 default:begin
                   case (pmtrdt_uop.vs1_eew)
-                    EEW32: offset[i] = i%4 + (4*{{(`XLEN-32){1'b0}}, uop_data[(pmt_uop_done_cnt_q*`VLENB/4+i/4)/(`VLENB/4)].vs1_data[32*((i/4)%(`VLENB/4))+:32]});
+                    EEW32: offset[i] = (`XLEN+2)'(i%4 + (4*{{(`XLEN-32){1'b0}}, uop_data[(pmt_uop_done_cnt_q*`VLENB/4+i/4)/(`VLENB/4)].vs1_data[32*((i/4)%(`VLENB/4))+:32]}));
                     EEW16: begin
                       case (pmtrdt_uop.vs2_eew) // vrgatherei16
-                        EEW32:offset[i] = i%4 + (4*{{(`XLEN-16){1'b0}}, uop_data[(pmt_uop_done_cnt_q*`VLENB/4+i/4)/(`VLENB/4)].vs1_data[16*((pmt_uop_done_cnt_q*`VLENB/4+i/4)%(`VLENB/2))+:16]});
-                        EEW16:offset[i] = i%2 + (2*{{(`XLEN-16){1'b0}}, uop_data[(pmt_uop_done_cnt_q*`VLENB/2+i/2)/(`VLENB/2)].vs1_data[16*((i/2)%(`VLENB/2))+:16]});
-                        default:offset[i] = {{(`XLEN-16){1'b0}}, uop_data[(pmt_uop_done_cnt_q*`VLENB+i)/(`VLENB)].vs1_data[16*(i%(`VLENB/2))+:16]};
+                        EEW32:offset[i] = (`XLEN+2)'(i%4 + (4*{{(`XLEN-16){1'b0}}, uop_data[(pmt_uop_done_cnt_q*`VLENB/4+i/4)/(`VLENB/4)].vs1_data[16*((pmt_uop_done_cnt_q*`VLENB/4+i/4)%(`VLENB/2))+:16]}));
+                        EEW16:offset[i] = (`XLEN+2)'(i%2 + (2*{{(`XLEN-16){1'b0}}, uop_data[(pmt_uop_done_cnt_q*`VLENB/2+i/2)/(`VLENB/2)].vs1_data[16*((i/2)%(`VLENB/2))+:16]}));
+                        default:offset[i] = (`XLEN+2)'({{(`XLEN-16){1'b0}}, uop_data[(pmt_uop_done_cnt_q*`VLENB+i)/(`VLENB)].vs1_data[16*(i%(`VLENB/2))+:16]});
                       endcase
                     end
-                    default: offset[i] = {{(`XLEN-8){1'b0}}, uop_data[(pmt_uop_done_cnt_q*`VLENB+i)/(`VLENB)].vs1_data[8*(i%(`VLENB))+:8]};
+                    default: offset[i] = (`XLEN+2)'({{(`XLEN-8){1'b0}}, uop_data[(pmt_uop_done_cnt_q*`VLENB+i)/(`VLENB)].vs1_data[8*(i%(`VLENB))+:8]});
                   endcase
                 end
               endcase
             end
-            default: offset[i] = i;
+            default: offset[i] = (`XLEN+2)'(i);
           endcase
         end
       end
@@ -2366,18 +2367,21 @@
             SLIDE_UP:begin
               if (pmt_uop_done_cnt_q == 0)
                 case (pmtrdt_uop.vs2_eew) // Permutation instruction: vd_eew == vs2_eew
-                  EEW32:sel_scalar = 'hF;
-                  EEW16:sel_scalar = 'h3;
-                  default:sel_scalar = 'h1;
+                  EEW32:sel_scalar = (`VLENB)'('hF);
+                  EEW16:sel_scalar = (`VLENB)'('h3);
+                  default:sel_scalar = (`VLENB)'('h1);
                 endcase
               else
                 sel_scalar = '0;
             end
             SLIDE_DOWN:begin
               case (pmtrdt_uop.vs2_eew) // Permutation instruction: vd_eew == vs2_eew
-                EEW32:sel_scalar = (uop_data[pmt_uop_done_cnt_q].uop_index+1'b1)*(`VLENB/4) >= rdt_ctrl.vl ? 'hF << ((rdt_ctrl.vl-1)%(`VLENB/4))*4 : '0;
-                EEW16:sel_scalar = (uop_data[pmt_uop_done_cnt_q].uop_index+1'b1)*(`VLENB/2) >= rdt_ctrl.vl ? 'h3 << ((rdt_ctrl.vl-1)%(`VLENB/2))*2 : '0;
-                default:sel_scalar = (uop_data[pmt_uop_done_cnt_q].uop_index+1'b1)*`VLENB >= rdt_ctrl.vl ? 'h1 << ((rdt_ctrl.vl-1)%(`VLENB))*1 : '0;
+                EEW32:sel_scalar = (uop_data[pmt_uop_done_cnt_q].uop_index+1'b1)*(`VLENB/4) >= rdt_ctrl.vl ?
+                                    (`VLENB)'('hF) << ((rdt_ctrl.vl-1)%(`VLENB/4))*4 : '0;
+                EEW16:sel_scalar = (uop_data[pmt_uop_done_cnt_q].uop_index+1'b1)*(`VLENB/2) >= rdt_ctrl.vl ?
+                                    (`VLENB)'('h3) << ((rdt_ctrl.vl-1)%(`VLENB/2))*2 : '0;
+                default:sel_scalar = (uop_data[pmt_uop_done_cnt_q].uop_index+1'b1)*`VLENB >= rdt_ctrl.vl ?
+                                    (`VLENB)'('h1) << ((rdt_ctrl.vl-1)%(`VLENB))*1 : '0;
               endcase
             end
             default:sel_scalar = '0;
@@ -2415,34 +2419,29 @@
       assign pmt_res_en = pmt_go;
       for (i=0; i<`VLENB; i++) begin
         always_comb begin
+          slide_down_offset[i] = offset[i]-(pmtrdt_uop.uop_index*`VLENB); // offset rebased by this uop's index*`VLENB; its low 8 bits index pmt_vs2_data in the SLIDE_DOWN branch
           if (sel_scalar[i]) pmt_res_d[i] = pmt_rs1_data[8*(i%4)+:8];
           else
             case (pmt_ctrl.pmt_opr)
               SLIDE_UP:begin
                 case (pmtrdt_uop.vs2_eew) // permutation instruction
-                  // TODO(derekjchow): Fix me
-                  // EEW32: pmt_res_d[i] = offset[i] >= 4*pmtrdt_uop.vlmax ? pmt_vs3_data[pmt_uop_done_cnt_q*`VLENB+i] : pmt_vs2_data[offset[i]];
-                  // EEW16: pmt_res_d[i] = offset[i] >= 2*pmtrdt_uop.vlmax ? pmt_vs3_data[pmt_uop_done_cnt_q*`VLENB+i] : pmt_vs2_data[offset[i]];
-                  // default: pmt_res_d[i] = offset[i] >= pmtrdt_uop.vlmax ? pmt_vs3_data[pmt_uop_done_cnt_q*`VLENB+i] : pmt_vs2_data[offset[i]];
-                  default: pmt_res_d[i] = 0;
+                  EEW32: pmt_res_d[i] = offset[i] >= (`XLEN+2)'(4*pmtrdt_uop.vlmax) ? pmt_vs3_data[pmt_uop_done_cnt_q*`VLENB+i] : pmt_vs2_data[offset[i][7:0]];
+                  EEW16: pmt_res_d[i] = offset[i] >= (`XLEN+2)'(2*pmtrdt_uop.vlmax) ? pmt_vs3_data[pmt_uop_done_cnt_q*`VLENB+i] : pmt_vs2_data[offset[i][7:0]];
+                  default: pmt_res_d[i] = offset[i] >= (`XLEN+2)'(pmtrdt_uop.vlmax) ? pmt_vs3_data[pmt_uop_done_cnt_q*`VLENB+i] : pmt_vs2_data[offset[i][7:0]];
                 endcase
               end
               SLIDE_DOWN:begin
                 case (pmtrdt_uop.vs2_eew)
-                  // TODO(derekjchow): Fix me
-                  // EEW32: pmt_res_d[i] = offset[i] >= 4*pmtrdt_uop.vlmax ? '0 : pmt_vs2_data[offset[i]-(pmtrdt_uop.uop_index*`VLENB)];
-                  // EEW16: pmt_res_d[i] = offset[i] >= 2*pmtrdt_uop.vlmax ? '0 : pmt_vs2_data[offset[i]-(pmtrdt_uop.uop_index*`VLENB)];
-                  // default: pmt_res_d[i] = offset[i] >= pmtrdt_uop.vlmax ? '0 : pmt_vs2_data[offset[i]-(pmtrdt_uop.uop_index*`VLENB)];
-                  default: pmt_res_d[i] = 0;
+                  EEW32: pmt_res_d[i] = offset[i] >= (`XLEN+2)'(4*pmtrdt_uop.vlmax) ? '0 : pmt_vs2_data[slide_down_offset[i][7:0]];
+                  EEW16: pmt_res_d[i] = offset[i] >= (`XLEN+2)'(2*pmtrdt_uop.vlmax) ? '0 : pmt_vs2_data[slide_down_offset[i][7:0]];
+                  default: pmt_res_d[i] = offset[i] >= (`XLEN+2)'(pmtrdt_uop.vlmax) ? '0 : pmt_vs2_data[slide_down_offset[i][7:0]];
                 endcase
               end
               default: begin
                 case (pmtrdt_uop.vs2_eew)
-                  // TODO(derekjchow): Fix me
-                  // EEW32: pmt_res_d[i] = offset[i] >= 4*pmtrdt_uop.vlmax ? '0 : pmt_vs2_data[offset[i]];
-                  // EEW16: pmt_res_d[i] = offset[i] >= 2*pmtrdt_uop.vlmax ? '0 : pmt_vs2_data[offset[i]];
-                  // default: pmt_res_d[i] = offset[i] >= pmtrdt_uop.vlmax ? '0 : pmt_vs2_data[offset[i]];
-                  default: pmt_res_d[i] = 0;
+                  EEW32: pmt_res_d[i] = offset[i] >= (`XLEN+2)'(4*pmtrdt_uop.vlmax) ? '0 : pmt_vs2_data[offset[i][7:0]];
+                  EEW16: pmt_res_d[i] = offset[i] >= (`XLEN+2)'(2*pmtrdt_uop.vlmax) ? '0 : pmt_vs2_data[offset[i][7:0]];
+                  default: pmt_res_d[i] = offset[i] >= (`XLEN+2)'(pmtrdt_uop.vlmax) ? '0 : pmt_vs2_data[offset[i][7:0]];
                 endcase
               end
             endcase
@@ -2519,14 +2518,14 @@
 
       // compress_res is driven by compress_value and compress_cnt.
       always_comb begin
-        if (pmtrdt_uop.first_uop_valid) compress_res_d = {'0, compress_value};
+        if (pmtrdt_uop.first_uop_valid) compress_res_d = (2*`VLENB*8)'(compress_value);
         else                            compress_res_d = f_circular_shift(compress_value, compress_cnt_q);
       end
 
       // compress_res_en
       always_comb begin
         if (compress_ctrl_push)
-          if (pmtrdt_uop.first_uop_valid) compress_res_en = {'0, f_pack_1s(compress_enable)};
+          if (pmtrdt_uop.first_uop_valid) compress_res_en = (2*`VLENB)'(f_pack_1s(compress_enable));
           else                            compress_res_en = f_circular_en(compress_enable,compress_cnt_q);
         else 
           compress_res_en = '0;
@@ -2681,7 +2680,7 @@
       for (i=0; i<`VLENB; i++) results[i] = '1;
       for (i=0; i<`VLENB; i++) begin
         if (enables[i]) begin
-          results[j] = i;
+          results[j] = (VLENB_WIDTH+1)'(i);
           j++;
         end
       end
@@ -2699,7 +2698,7 @@
     logic [1:0][`VLEN-1:0]  result;
     begin
       value_tmp = value;
-      {buf2,buf1,buf0} = value_tmp << (shift*8);
+      {buf2,buf1,buf0} = (3*`VLEN)'(value_tmp) << (shift*8);
       result = shift[VLENB_WIDTH] ? {buf1, buf2} : {buf1,buf0};
       f_circular_shift = result;
     end
@@ -2733,7 +2732,7 @@
     logic [1:0][`VLENB-1:0] result;
     begin
       value_pack_1s = f_pack_1s(value);
-      {en2,en1,en0} = value_pack_1s << shift;
+      {en2,en1,en0} = (3*`VLENB)'(value_pack_1s) << shift;
       result = shift[VLENB_WIDTH] ? {en1, en2} : {en1, en0};
       f_circular_en = result;
     end