Adjust vector register order in Decoder for segment load/store. Update rvv_backend_tb for lsu changes.

Change-Id: I60d55196d033b70f5e64baddf17ae2d40e96c574
diff --git a/hdl/verilog/rvv/design/rvv_backend_decode_unit_ari.sv b/hdl/verilog/rvv/design/rvv_backend_decode_unit_ari.sv
index 61b3e2c..dd4febd 100644
--- a/hdl/verilog/rvv/design/rvv_backend_decode_unit_ari.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_decode_unit_ari.sv
@@ -102,15 +102,18 @@
   logic   [`NUM_DE_UOP-1:0]                           force_vta_agnostic; 
   logic   [`NUM_DE_UOP-1:0]                           vm;                 
   logic   [`NUM_DE_UOP-1:0]                           v0_valid;           
-  logic   [`NUM_DE_UOP-1:0][`REGFILE_INDEX_WIDTH-1:0] vd_index;           
+  logic   [`NUM_DE_UOP-1:0][`REGFILE_INDEX_WIDTH-1:0] vd_index;
+  logic   [`NUM_DE_UOP-1:0][`UOP_INDEX_WIDTH-1:0]     vd_offset;          // per-uop offset, added to inst_vd to form vd_index
   EEW_e   [`NUM_DE_UOP-1:0]                           vd_eew;  
   logic   [`NUM_DE_UOP-1:0]                           vd_valid;
   logic   [`NUM_DE_UOP-1:0]                           vs3_valid;          
-  logic   [`NUM_DE_UOP-1:0][`REGFILE_INDEX_WIDTH-1:0] vs1;              
+  logic   [`NUM_DE_UOP-1:0][`REGFILE_INDEX_WIDTH-1:0] vs1;
+  logic   [`NUM_DE_UOP-1:0][`UOP_INDEX_WIDTH-1:0]     vs1_offset;         // per-uop offset, added to inst_vs1 to form vs1
   EEW_e   [`NUM_DE_UOP-1:0]                           vs1_eew;            
   logic   [`NUM_DE_UOP-1:0]                           vs1_index_valid;
   logic   [`NUM_DE_UOP-1:0]                           vs1_opcode_valid;
-  logic   [`NUM_DE_UOP-1:0][`REGFILE_INDEX_WIDTH-1:0] vs2_index; 	        
+  logic   [`NUM_DE_UOP-1:0][`REGFILE_INDEX_WIDTH-1:0] vs2_index;
+  logic   [`NUM_DE_UOP-1:0][`UOP_INDEX_WIDTH-1:0]     vs2_offset;         // per-uop offset, added to inst_vs2 to form vs2_index
   EEW_e   [`NUM_DE_UOP-1:0]                           vs2_eew;
   logic   [`NUM_DE_UOP-1:0]                           vs2_valid;
   logic   [`NUM_DE_UOP-1:0][`REGFILE_INDEX_WIDTH-1:0] rd_index; 	        
@@ -4182,14 +4185,12 @@
     end
   end    
   
-  // update vd_index, eew and valid
+  // update vd_offset and valid
   always_comb begin
-    // initial
-    vd_index = 'b0;
-    vd_eew   = EEW_NONE;
-    vd_valid = 'b0;
-      
-    for(int i=0;i<`NUM_DE_UOP;i++) begin: GET_VD
+    vd_offset = 'b0;
+    vd_valid  = 'b0;
+
+    for(int i=0;i<`NUM_DE_UOP;i++) begin: GET_VD_OFFSET
       case(1'b1)
         valid_opi: begin
           case(funct6_ari.ari_funct6)
@@ -4212,9 +4213,8 @@
                 OPIVV,
                 OPIVX,
                 OPIVI: begin  
-                  vd_index[i] = inst_vd+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                  vd_eew[i]   = eew_vd;
-                  vd_valid[i] = 1'b1;
+                  vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  vd_valid[i]  = 1'b1;
                 end 
               endcase
             end
@@ -4230,9 +4230,8 @@
               case(inst_funct3)
                 OPIVV,
                 OPIVX: begin  
-                  vd_index[i] = inst_vd+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                  vd_eew[i]   = eew_vd;
-                  vd_valid[i] = 1'b1;
+                  vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  vd_valid[i]  = 1'b1;
                 end 
               endcase
             end
@@ -4242,9 +4241,8 @@
               case(inst_funct3)
                 OPIVX,
                 OPIVI: begin  
-                  vd_index[i] = inst_vd+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                  vd_eew[i]   = eew_vd;
-                  vd_valid[i] = 1'b1;
+                  vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0]; // each uop writes its own register: inst_vd + uop index
+                  vd_valid[i]  = 1'b1;
                 end 
               endcase
             end
@@ -4258,9 +4256,8 @@
                 OPIVV,
                 OPIVX,
                 OPIVI: begin  
-                  vd_index[i] = inst_vd;
-                  vd_eew[i]   = eew_vd;
-                  vd_valid[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0] == uop_index_max;
+                  vd_offset[i] = 'b0;
+                  vd_valid[i]  = uop_index_current[i][`UOP_INDEX_WIDTH-1:0] == uop_index_max;
                 end
               endcase
             end
@@ -4271,9 +4268,8 @@
               case(inst_funct3)
                 OPIVV,
                 OPIVX: begin  
-                  vd_index[i] = inst_vd;
-                  vd_eew[i]   = eew_vd;
-                  vd_valid[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0] == uop_index_max;
+                  vd_offset[i] = 'b0;
+                  vd_valid[i]  = uop_index_current[i][`UOP_INDEX_WIDTH-1:0] == uop_index_max;
                 end
               endcase
             end
@@ -4283,9 +4279,8 @@
               case(inst_funct3)
                 OPIVX,
                 OPIVI: begin  
-                  vd_index[i] = inst_vd;
-                  vd_eew[i]   = eew_vd;
-                  vd_valid[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0] == uop_index_max;
+                  vd_offset[i] = 'b0;
+                  vd_valid[i]  = uop_index_current[i][`UOP_INDEX_WIDTH-1:0] == uop_index_max;
                 end
               endcase
             end
@@ -4298,9 +4293,8 @@
                 OPIVV,
                 OPIVX,
                 OPIVI: begin
-                  vd_index[i] = inst_vd+uop_index_current[i][`UOP_INDEX_WIDTH-1:1];
-                  vd_eew[i]   = eew_vd;
-                  vd_valid[i] = 1'b1;
+                  vd_offset[i] = {1'b0, uop_index_current[i][`UOP_INDEX_WIDTH-1:1]};
+                  vd_valid[i]  = 1'b1;
                 end
               endcase
             end
@@ -4308,9 +4302,8 @@
             VWREDSUMU,
             VWREDSUM: begin
               if(inst_funct3==OPIVV) begin
-                vd_index[i]   = inst_vd;
-                vd_eew[i]     = eew_vd;
-                vd_valid[i]   = uop_index_current[i][`UOP_INDEX_WIDTH-1:0] == uop_index_max;
+                vd_offset[i] = 'b0;
+                vd_valid[i]  = uop_index_current[i][`UOP_INDEX_WIDTH-1:0] == uop_index_max;
               end
             end
 
@@ -4322,24 +4315,21 @@
                     {EMUL2,EMUL2},
                     {EMUL4,EMUL4},
                     {EMUL8,EMUL8}: begin
-                      vd_index[i] = inst_vd+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                      vd_eew[i]   = eew_vd;
-                      vd_valid[i] = 1'b1;
+                      vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                      vd_valid[i]  = 1'b1;
                     end
                     {EMUL2,EMUL1},
                     {EMUL4,EMUL2},
                     {EMUL8,EMUL4}: begin
-                      vd_index[i] = inst_vd+uop_index_current[i][`UOP_INDEX_WIDTH-1:1];
-                      vd_eew[i]   = eew_vd;
-                      vd_valid[i] = 1'b1;
+                      vd_offset[i] = {1'b0, uop_index_current[i][`UOP_INDEX_WIDTH-1:1]};
+                      vd_valid[i]  = 1'b1;
                     end
                   endcase
                 end
                 OPIVX,
                 OPIVI: begin  
-                  vd_index[i] = inst_vd+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                  vd_eew[i]   = eew_vd;
-                  vd_valid[i] = 1'b1;
+                  vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  vd_valid[i]  = 1'b1;
                 end 
               endcase
             end
@@ -4382,9 +4372,8 @@
               case(inst_funct3)
                 OPMVV,
                 OPMVX: begin
-                  vd_index[i] = inst_vd+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                  vd_eew[i]   = eew_vd;
-                  vd_valid[i] = 1'b1;
+                  vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  vd_valid[i]  = 1'b1;
                 end
               endcase
             end   
@@ -4393,9 +4382,8 @@
             VCOMPRESS: begin
               case(inst_funct3)
                 OPMVV: begin
-                  vd_index[i] = inst_vd+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                  vd_eew[i]   = eew_vd;
-                  vd_valid[i] = 1'b1;
+                  vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  vd_valid[i]  = 1'b1;
                 end
               endcase
             end
@@ -4405,9 +4393,8 @@
             VSLIDE1DOWN: begin
               case(inst_funct3)
                 OPMVX: begin
-                  vd_index[i] = inst_vd+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                  vd_eew[i]   = eew_vd;
-                  vd_valid[i] = 1'b1;
+                  vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  vd_valid[i]  = 1'b1;
                 end
               endcase
             end 
@@ -4422,9 +4409,8 @@
             VREDXOR: begin
               case(inst_funct3)
                 OPMVV: begin
-                  vd_index[i] = inst_vd;
-                  vd_eew[i]   = eew_vd;
-                  vd_valid[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0] == uop_index_max;
+                  vd_offset[i] = 'b0;
+                  vd_valid[i]  = uop_index_current[i][`UOP_INDEX_WIDTH-1:0] == uop_index_max;
                 end
               endcase
             end
@@ -4439,9 +4425,8 @@
             VMXNOR: begin
               case(inst_funct3)
                 OPMVV: begin
-                  vd_index[i] = inst_vd;
-                  vd_eew[i]   = eew_vd;
-                  vd_valid[i] = 1'b1;
+                  vd_offset[i] = 'b0;
+                  vd_valid[i]  = 1'b1;
                 end
               endcase
             end
@@ -4449,9 +4434,8 @@
             VWXUNARY0: begin
               case(inst_funct3)
                 OPMVX: begin
-                  vd_index[i] = inst_vd;
-                  vd_eew[i]   = eew_vd;
-                  vd_valid[i] = 1'b1;
+                  vd_offset[i] = 'b0;
+                  vd_valid[i]  = 1'b1;
                 end
               endcase
             end
@@ -4463,15 +4447,13 @@
                     VMSBF,
                     VMSIF,
                     VMSOF: begin
-                      vd_index[i] = inst_vd;
-                      vd_eew[i]   = eew_vd;
-                      vd_valid[i] = 1'b1;
+                      vd_offset[i] = 'b0;
+                      vd_valid[i]  = 1'b1;
                     end
                     VIOTA,
                     VID: begin
-                      vd_index[i] = inst_vd+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                      vd_eew[i]   = eew_vd;
-                      vd_valid[i] = 1'b1;
+                      vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                      vd_valid[i]  = 1'b1;
                     end
                   endcase
                 end
@@ -4483,6 +4465,14 @@
     end
   end
 
+  // update vd_index and eew: vd_index = inst_vd + zero-extended per-uop vd_offset
+  always_comb begin
+    for(int i=0;i<`NUM_DE_UOP;i++) begin: GET_VD
+      vd_index[i] = inst_vd + {2'b0, vd_offset[i]};
+      vd_eew[i]   = eew_vd; // every uop is assigned the same eew_vd
+    end
+  end
+
   // some uop need vd as the vs3 vector operand
   always_comb begin
     // initial
@@ -4610,14 +4600,12 @@
     end
   end
   
-  // update vs1 
+  // update vs1_offset and valid
   always_comb begin
-    // initial
-    vs1             = 'b0; 
-    vs1_eew         = EEW_NONE;
+    vs1_offset      = 'b0;
     vs1_index_valid = 'b0;
       
-    for(int i=0;i<`NUM_DE_UOP;i++) begin: GET_VS1
+    for(int i=0;i<`NUM_DE_UOP;i++) begin: GET_VS1_OFFSET
       case(inst_funct3)
         OPIVV: begin
           case(funct6_ari.ari_funct6)
@@ -4652,25 +4640,22 @@
             VSSRL,
             VSSRA,
             VRGATHER: begin
-              vs1[i]              = inst_vs1+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-              vs1_eew[i]          = eew_vs1;
-              vs1_index_valid[i]  = 1'b1;   
+              vs1_offset[i]      = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+              vs1_index_valid[i] = 1'b1;
             end
             
             VNSRL,
             VNSRA,
             VNCLIPU,
             VNCLIP: begin
-              vs1[i]              = inst_vs1+uop_index_current[i][`UOP_INDEX_WIDTH-1:1];
-              vs1_eew[i]          = eew_vs1;
-              vs1_index_valid[i]  = 1'b1;
+              vs1_offset[i]      = {1'b0, uop_index_current[i][`UOP_INDEX_WIDTH-1:1]};
+              vs1_index_valid[i] = 1'b1;
             end
             
             VWREDSUMU,
             VWREDSUM: begin
-              vs1[i]              = inst_vs1;
-              vs1_eew[i]          = eew_vs1;
-              vs1_index_valid[i]  = uop_index_current[i][`UOP_INDEX_WIDTH-1:0] == uop_index_max;   
+              vs1_offset[i]      = 'b0;
+              vs1_index_valid[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0] == uop_index_max;
             end        
             
             VSLIDEUP_RGATHEREI16: begin
@@ -4679,16 +4664,14 @@
                 {EMUL2,EMUL2},
                 {EMUL4,EMUL4},
                 {EMUL8,EMUL8}: begin
-                  vs1[i]              = inst_vs1+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                  vs1_eew[i]          = eew_vs1;
-                  vs1_index_valid[i]  = 1'b1;
+                  vs1_offset[i]      = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  vs1_index_valid[i] = 1'b1;
                 end
                 {EMUL2,EMUL1},
                 {EMUL4,EMUL2},
                 {EMUL8,EMUL4}: begin
-                  vs1[i]              = inst_vs1+uop_index_current[i][`UOP_INDEX_WIDTH-1:1];
-                  vs1_eew[i]          = eew_vs1;
-                  vs1_index_valid[i]  = 1'b1;
+                  vs1_offset[i]      = {1'b0, uop_index_current[i][`UOP_INDEX_WIDTH-1:1]};
+                  vs1_index_valid[i] = 1'b1;
                 end
               endcase
             end
@@ -4711,17 +4694,15 @@
             VWMACCU,
             VWMACC,
             VWMACCSU: begin
-              vs1[i]              = inst_vs1+uop_index_current[i][`UOP_INDEX_WIDTH-1:1];
-              vs1_eew[i]          = eew_vs1;
-              vs1_index_valid[i]  = 1'b1;        
+              vs1_offset[i]      = {1'b0, uop_index_current[i][`UOP_INDEX_WIDTH-1:1]};
+              vs1_index_valid[i] = 1'b1;
             end
 
             VXUNARY0,
             VWXUNARY0,
             VMUNARY0: begin
-              vs1[i]              = inst_vs1; // vs1 is regarded as opcode
-              vs1_eew[i]          = eew_vs1;
-              vs1_index_valid[i]  = 'b0;        
+              vs1_offset[i]      = 'b0; // vs1 is regarded as opcode
+              vs1_index_valid[i] = 'b0;
             end
 
             VMUL,
@@ -4740,9 +4721,8 @@
             VAADD,
             VASUBU,
             VASUB: begin
-              vs1[i]              = inst_vs1+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-              vs1_eew[i]          = eew_vs1;
-              vs1_index_valid[i]  = 1'b1;        
+              vs1_offset[i]      = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+              vs1_index_valid[i] = 1'b1;
             end
 
             // reduction
@@ -4754,9 +4734,8 @@
             VREDAND,
             VREDOR,
             VREDXOR: begin
-              vs1[i]              = inst_vs1;
-              vs1_eew[i]          = eew_vs1;
-              vs1_index_valid[i]  = uop_index_current[i][`UOP_INDEX_WIDTH-1:0] == uop_index_max;
+              vs1_offset[i]      = 'b0;
+              vs1_index_valid[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0] == uop_index_max;
             end
 
             VMAND,
@@ -4767,16 +4746,14 @@
             VMNOR,
             VMORN,
             VMXNOR: begin
-              vs1[i]              = inst_vs1;
-              vs1_eew[i]          = eew_vs1;
-              vs1_index_valid[i]  = 1'b1;
+              vs1_offset[i]      = 'b0;
+              vs1_index_valid[i] = 1'b1;
             end
 
             VCOMPRESS: begin
               if (uop_index_current[i][`UOP_INDEX_WIDTH-1:0] == uop_vstart) begin
-                vs1[i]              = inst_vs1;
-                vs1_eew[i]          = eew_vs1;
-                vs1_index_valid[i]  = 1'b1;        
+                vs1_offset[i]      = 'b0;
+                vs1_index_valid[i] = 1'b1;
               end
             end
           endcase
@@ -4785,6 +4762,14 @@
     end
   end
 
+  // vs1 (register index, or opcode field) = inst_vs1 + zero-extended per-uop vs1_offset; eew is the same for every uop
+  always_comb begin
+    for(int uop=0;uop<`NUM_DE_UOP;uop++) begin: GET_VS1
+      vs1_eew[uop] = eew_vs1;
+      vs1[uop]     = {2'b0, vs1_offset[uop]} + inst_vs1;
+    end
+  end
+
   // some uop will use vs1 field as an opcode to decode  
   always_comb begin
     // initial
@@ -4830,14 +4815,13 @@
     end
   end
 
-  // update vs2 index, eew and valid  
+  // update vs2 offset and valid  
   always_comb begin
     // initial
-    vs2_index = 'b0; 
-    vs2_eew   = EEW_NONE;
+    vs2_offset = 'b0;
     vs2_valid = 'b0; 
       
-    for(int i=0;i<`NUM_DE_UOP;i++) begin: GET_VS2
+    for(int i=0;i<`NUM_DE_UOP;i++) begin: GET_VS2_OFFSET
       case(1'b1)
         valid_opi: begin
           // OPI*
@@ -4869,9 +4853,8 @@
                 OPIVV,
                 OPIVX,
                 OPIVI: begin
-                  vs2_index[i]    = inst_vs2+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                  vs2_eew[i]      = eew_vs2;
-                  vs2_valid[i]    = 1'b1;
+                  vs2_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  vs2_valid[i]  = 1'b1;
                 end
               endcase
             end
@@ -4890,9 +4873,8 @@
               case(inst_funct3)
                 OPIVV,
                 OPIVX: begin
-                  vs2_index[i]    = inst_vs2+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                  vs2_eew[i]      = eew_vs2;
-                  vs2_valid[i]    = 1'b1;
+                  vs2_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  vs2_valid[i]  = 1'b1;
                 end
               endcase
             end
@@ -4904,9 +4886,8 @@
               case(inst_funct3)
                 OPIVX,
                 OPIVI: begin
-                  vs2_index[i]    = inst_vs2+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                  vs2_eew[i]      = eew_vs2;
-                  vs2_valid[i]    = 1'b1;
+                  vs2_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  vs2_valid[i]  = 1'b1;
                 end
               endcase
             end
@@ -4917,8 +4898,7 @@
                 OPIVX,
                 OPIVI: begin
                   if(inst_vm==1'b0) begin
-                    vs2_index[i]  = inst_vs2+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                    vs2_eew[i]    = eew_vs2;
+                    vs2_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
                     vs2_valid[i]  = 1'b1;
                   end
                 end
@@ -4929,9 +4909,8 @@
             VWREDSUM: begin
               case(inst_funct3)
                 OPIVV: begin
-                  vs2_index[i]    = inst_vs2+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                  vs2_eew[i]      = eew_vs2;
-                  vs2_valid[i]    = 1'b1;
+                  vs2_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  vs2_valid[i]  = 1'b1;
                 end
               endcase
             end
@@ -4944,24 +4923,21 @@
                     {EMUL2,EMUL2},
                     {EMUL4,EMUL4},
                     {EMUL8,EMUL8}: begin
-                      vs2_index[i]  = inst_vs2+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                      vs2_eew[i]    = eew_vs2;
+                      vs2_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
                       vs2_valid[i]  = 1'b1;
                     end
                     {EMUL2,EMUL1},
                     {EMUL4,EMUL2},
                     {EMUL8,EMUL4}: begin
-                      vs2_index[i]  = inst_vs2+uop_index_current[i][`UOP_INDEX_WIDTH-1:1];
-                      vs2_eew[i]    = eew_vs2;
+                      vs2_offset[i] = {1'b0, uop_index_current[i][`UOP_INDEX_WIDTH-1:1]};
                       vs2_valid[i]  = 1'b1;
                     end
                   endcase
                 end
                 OPIVX,
                 OPIVI: begin  
-                  vs2_index[i] = inst_vs2+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                  vs2_eew[i]   = eew_vs2;
-                  vs2_valid[i] = 1'b1;
+                  vs2_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  vs2_valid[i]  = 1'b1;
                 end 
               endcase
             end
@@ -4984,9 +4960,8 @@
               case(inst_funct3)
                 OPMVV,
                 OPMVX: begin
-                  vs2_index[i]    = inst_vs2+uop_index_current[i][`UOP_INDEX_WIDTH-1:1];
-                  vs2_eew[i]      = eew_vs2;
-                  vs2_valid[i]    = 1'b1;        
+                  vs2_offset[i] = {1'b0, uop_index_current[i][`UOP_INDEX_WIDTH-1:1]};
+                  vs2_valid[i]  = 1'b1;
                 end
               endcase
             end
@@ -5014,9 +4989,8 @@
               case(inst_funct3)
                 OPMVV,
                 OPMVX: begin
-                  vs2_index[i]    = inst_vs2+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                  vs2_eew[i]      = eew_vs2;
-                  vs2_valid[i]    = 1'b1;        
+                  vs2_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  vs2_valid[i]  = 1'b1;
                 end
               endcase
             end
@@ -5028,20 +5002,17 @@
                     {EMUL1,EMUL1},
                     {EMUL2,EMUL1},
                     {EMUL4,EMUL1}: begin
-                      vs2_index[i]    = inst_vs2;
-                      vs2_eew[i]      = eew_vs2;
-                      vs2_valid[i]    = 1'b1;
+                      vs2_offset[i] = 'b0;
+                      vs2_valid[i]  = 1'b1;
                     end
                     {EMUL4,EMUL2},
                     {EMUL8,EMUL4}: begin
-                      vs2_index[i]    = inst_vs2+uop_index_current[i][`UOP_INDEX_WIDTH-1:1];
-                      vs2_eew[i]      = eew_vs2;
-                      vs2_valid[i]    = 1'b1;
+                      vs2_offset[i] = {1'b0, uop_index_current[i][`UOP_INDEX_WIDTH-1:1]};
+                      vs2_valid[i]  = 1'b1;
                     end
                     {EMUL8,EMUL2}: begin
-                      vs2_index[i]    = inst_vs2+uop_index_current[i][`UOP_INDEX_WIDTH-1:2];
-                      vs2_eew[i]      = eew_vs2;
-                      vs2_valid[i]    = 1'b1;
+                      vs2_offset[i] = {2'b0, uop_index_current[i][`UOP_INDEX_WIDTH-1:2]};
+                      vs2_valid[i]  = 1'b1;
                     end
                   endcase
                 end
@@ -5051,9 +5022,8 @@
             VWMACCUS: begin
               case(inst_funct3)
                 OPMVX: begin
-                  vs2_index[i]    = inst_vs2+uop_index_current[i][`UOP_INDEX_WIDTH-1:1];
-                  vs2_eew[i]      = eew_vs2;
-                  vs2_valid[i]    = 1'b1;        
+                  vs2_offset[i] = {1'b0, uop_index_current[i][`UOP_INDEX_WIDTH-1:1]};
+                  vs2_valid[i]  = 1'b1;
                 end
               endcase
             end
@@ -5070,9 +5040,8 @@
             VCOMPRESS: begin
               case(inst_funct3)
                 OPMVV: begin
-                  vs2_index[i]    = inst_vs2+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                  vs2_eew[i]      = eew_vs2;
-                  vs2_valid[i]    = 1'b1;   
+                  vs2_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  vs2_valid[i]  = 1'b1;
                 end
               endcase
             end
@@ -5087,9 +5056,8 @@
             VMXNOR: begin
               case(inst_funct3)
                 OPMVV: begin
-                  vs2_index[i]    = inst_vs2;
-                  vs2_eew[i]      = eew_vs2;
-                  vs2_valid[i]    = 1'b1;   
+                  vs2_offset[i] = 'b0;
+                  vs2_valid[i]  = 1'b1;
                 end
               endcase
             end
@@ -5102,9 +5070,8 @@
                     VMSIF,
                     VMSOF,
                     VIOTA: begin
-                      vs2_index[i]    = inst_vs2;
-                      vs2_eew[i]      = eew_vs2;
-                      vs2_valid[i]    = 1'b1;   
+                      vs2_offset[i] = 'b0;
+                      vs2_valid[i]  = 1'b1;
                     end
                   endcase
                 end
@@ -5115,9 +5082,8 @@
             VSLIDE1DOWN: begin
               case(inst_funct3)
                 OPMVX: begin
-                  vs2_index[i]    = inst_vs2+uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                  vs2_eew[i]      = eew_vs2;
-                  vs2_valid[i]    = 1'b1;        
+                  vs2_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  vs2_valid[i]  = 1'b1;
                 end
               endcase
             end
@@ -5127,6 +5093,14 @@
     end
   end
 
+  // vs2_index = inst_vs2 + zero-extended per-uop vs2_offset; eew is the same for every uop
+  always_comb begin
+    for(int uop=0;uop<`NUM_DE_UOP;uop++) begin: GET_VS2
+      vs2_eew[uop]   = eew_vs2;
+      vs2_index[uop] = {2'b0, vs2_offset[uop]} + inst_vs2;
+    end
+  end
+
   // update rd_index and valid
   always_comb begin
     // initial
diff --git a/hdl/verilog/rvv/design/rvv_backend_decode_unit_lsu.sv b/hdl/verilog/rvv/design/rvv_backend_decode_unit_lsu.sv
index a940fe2..c39222e 100644
--- a/hdl/verilog/rvv/design/rvv_backend_decode_unit_lsu.sv
+++ b/hdl/verilog/rvv/design/rvv_backend_decode_unit_lsu.sv
@@ -95,6 +95,7 @@
   logic   [`NUM_DE_UOP-1:0]                           vm;                 
   logic   [`NUM_DE_UOP-1:0]                           v0_valid;           
   logic   [`NUM_DE_UOP-1:0][`REGFILE_INDEX_WIDTH-1:0] vd_index;           
+  logic   [`NUM_DE_UOP-1:0][`UOP_INDEX_WIDTH-1:0]     vd_offset;
   EEW_e   [`NUM_DE_UOP-1:0]                           vd_eew;  
   logic   [`NUM_DE_UOP-1:0]                           vd_valid;
   logic   [`NUM_DE_UOP-1:0]                           vs3_valid;          
@@ -103,6 +104,7 @@
   logic   [`NUM_DE_UOP-1:0]                           vs1_index_valid;
   logic   [`NUM_DE_UOP-1:0]                           vs1_opcode_valid;
   logic   [`NUM_DE_UOP-1:0][`REGFILE_INDEX_WIDTH-1:0] vs2_index; 	        
+  logic   [`NUM_DE_UOP-1:0][`UOP_INDEX_WIDTH-1:0]     vs2_offset;
   EEW_e   [`NUM_DE_UOP-1:0]                           vs2_eew;
   logic   [`NUM_DE_UOP-1:0]                           vs2_valid;
   logic   [`NUM_DE_UOP-1:0][`REGFILE_INDEX_WIDTH-1:0] rd_index; 	        
@@ -228,10 +230,11 @@
             US_US,
             US_FF: begin
               case(inst_nf)
-                // EMUL_vd = ceil( inst_funct3/csr_sew*csr_lmul )
-                // emul_max_vd_vs2 = EMUL_vd
-                // emul_vd_nf = EMUL_vd*NF
-                // EMUL_max = NF*emul_max_vd_vs2
+                // emul_vd = ceil(inst_funct3/csr_sew*csr_lmul)
+                // emul_vs2: no emul_vs2 for unit
+                // emul_max_vd_vs2 = max(emul_vd,emul_vs2) = emul_vd
+                // emul_vd_nf = NF*emul_vd
+                // emul_max = NF*emul_max_vd_vs2
                 NF1: begin
                   case({inst_funct3,csr_sew})
                     // 1:1
@@ -1028,8 +1031,11 @@
 
         CS: begin
           case(inst_nf)
-            // EMUL_vd = ceil( inst_funct3/csr_sew*csr_lmul )
-            // EMUL_max = NF*EMUL_vd
+            // emul_vd = ceil(inst_funct3/csr_sew*csr_lmul)
+            // emul_vs2: no emul_vs2 for stride
+            // emul_max_vd_vs2 = max(emul_vd,emul_vs2) = emul_vd
+            // emul_vd_nf = NF*emul_vd
+            // emul_max = NF*emul_max_vd_vs2
             NF1: begin
               case({inst_funct3,csr_sew})
                 // 1:1
@@ -1782,10 +1788,11 @@
         IU,
         IO: begin
           case(inst_nf)
-            // EMUL_vd  = ceil( csr_lmul )
-            // EMUL_vs2 = ceil( inst_funct3/csr_sew*csr_lmul )
+            // emul_vd  = ceil(csr_lmul)
+            // emul_vs2 = ceil(inst_funct3/csr_sew*csr_lmul)
             // emul_max_vd_vs2 = max(EMUL_vd,EMUL_vs2)
-            // EMUL_max = NF*emul_max_vd_vs2
+            // emul_vd_nf = NF*emul_vd
+            // emul_max = NF*emul_max_vd_vs2
             NF1: begin
               case({inst_funct3,csr_sew})
                 // 1:1
@@ -3233,63 +3240,194 @@
     end
   end
 
-  // update vd_index and eew 
+  // update vd_offset: per-uop register offset (k = uop_index_current[i]) later added to inst_vd to form vd_index
   always_comb begin
-    for(int i=0;i<`NUM_DE_UOP;i++) begin: GET_VD
+    for(int i=0;i<`NUM_DE_UOP;i++) begin: GET_VD_OFFSET
       // initial
-      vd_index[i] = 'b0;
-      vd_eew[i]   = eew_vd;
+      vd_offset[i] = 'b0; // kept when no case item below assigns a value
 
       case(inst_funct6[2:0])
         UNIT_STRIDE: begin
           case(inst_umop)
             US_REGULAR,          
-            US_FAULT_FIRST,
+            US_FAULT_FIRST: begin
+              case({inst_nf,emul_vd}) // listed {NF,EMUL} pairs reorder registers; all others use offset = k
+                {NF2,EMUL4}: begin // offset = (k%2)*4 + k/2 for k=0..7
+                  case(uop_index_current[i][`UOP_INDEX_WIDTH-1:0])
+                    3'd1   : vd_offset[i] = 3'd4;
+                    3'd2   : vd_offset[i] = 3'd1;
+                    3'd3   : vd_offset[i] = 3'd5;
+                    3'd4   : vd_offset[i] = 3'd2;
+                    3'd5   : vd_offset[i] = 3'd6;
+                    3'd6   : vd_offset[i] = 3'd3;
+                    default: vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  endcase   
+                end
+                {NF2,EMUL2}: begin // offset = (k%2)*2 + k/2 for k=0..3
+                  case(uop_index_current[i][`UOP_INDEX_WIDTH-1:0])
+                    3'd1   : vd_offset[i] = 3'd2;
+                    3'd2   : vd_offset[i] = 3'd1;
+                    default: vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  endcase   
+                end
+                {NF3,EMUL2}: begin // offset = (k%3)*2 + k/3 for k=0..5
+                  case(uop_index_current[i][`UOP_INDEX_WIDTH-1:0])
+                    3'd1   : vd_offset[i] = 3'd2;
+                    3'd2   : vd_offset[i] = 3'd4;
+                    3'd3   : vd_offset[i] = 3'd1;
+                    3'd4   : vd_offset[i] = 3'd3;
+                    default: vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  endcase   
+                end
+                {NF4,EMUL2}: begin // offset = (k%4)*2 + k/4 for k=0..7
+                  case(uop_index_current[i][`UOP_INDEX_WIDTH-1:0])
+                    3'd1   : vd_offset[i] = 3'd2;
+                    3'd2   : vd_offset[i] = 3'd4;
+                    3'd3   : vd_offset[i] = 3'd6;
+                    3'd4   : vd_offset[i] = 3'd1;
+                    3'd5   : vd_offset[i] = 3'd3;
+                    3'd6   : vd_offset[i] = 3'd5;
+                    default: vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  endcase
+                end
+                default: // no reordering: offset follows the uop index
+                  vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+              endcase
+            end
             US_WHOLE_REGISTER: begin
-              vd_index[i] = inst_vd + uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+              vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
             end
             US_MASK: begin
-              vd_index[i] = inst_vd;
+              vd_offset[i] = 'b0;
             end
           endcase
         end
 
         CONSTANT_STRIDE: begin
-          vd_index[i] = inst_vd + uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+          case({inst_nf,emul_vd}) // same reorder tables as the unit-stride regular/fault-first case
+            {NF2,EMUL4}: begin // offset = (k%2)*4 + k/2 for k=0..7
+              case(uop_index_current[i][`UOP_INDEX_WIDTH-1:0])
+                3'd1   : vd_offset[i] = 3'd4;
+                3'd2   : vd_offset[i] = 3'd1;
+                3'd3   : vd_offset[i] = 3'd5;
+                3'd4   : vd_offset[i] = 3'd2;
+                3'd5   : vd_offset[i] = 3'd6;
+                3'd6   : vd_offset[i] = 3'd3;
+                default: vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+              endcase   
+            end
+            {NF2,EMUL2}: begin // offset = (k%2)*2 + k/2 for k=0..3
+              case(uop_index_current[i][`UOP_INDEX_WIDTH-1:0])
+                3'd1   : vd_offset[i] = 3'd2;
+                3'd2   : vd_offset[i] = 3'd1;
+                default: vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+              endcase   
+            end
+            {NF3,EMUL2}: begin // offset = (k%3)*2 + k/3 for k=0..5
+              case(uop_index_current[i][`UOP_INDEX_WIDTH-1:0])
+                3'd1   : vd_offset[i] = 3'd2;
+                3'd2   : vd_offset[i] = 3'd4;
+                3'd3   : vd_offset[i] = 3'd1;
+                3'd4   : vd_offset[i] = 3'd3;
+                default: vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+              endcase   
+            end
+            {NF4,EMUL2}: begin // offset = (k%4)*2 + k/4 for k=0..7
+              case(uop_index_current[i][`UOP_INDEX_WIDTH-1:0])
+                3'd1   : vd_offset[i] = 3'd2;
+                3'd2   : vd_offset[i] = 3'd4;
+                3'd3   : vd_offset[i] = 3'd6;
+                3'd4   : vd_offset[i] = 3'd1;
+                3'd5   : vd_offset[i] = 3'd3;
+                3'd6   : vd_offset[i] = 3'd5;
+                default: vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+              endcase
+            end
+            default: // no reordering: offset follows the uop index
+              vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+          endcase
         end
         
         UNORDERED_INDEX,
         ORDERED_INDEX: begin
-          case({inst_funct3,csr_sew})
+          case({eew_vs2,eew_vd})
             // EEW_vs2:EEW_vd=1:1
-            {SEW_8,SEW8},
-            {SEW_16,SEW16},
-            {SEW_32,SEW32},            
+            {EEW8,EEW8},
+            {EEW16,EEW16},
+            {EEW32,EEW32},
             // 1:2
-            {SEW_8,SEW16},
-            {SEW_16,SEW32},
+            {EEW8,EEW16},
+            {EEW16,EEW32},
             // 1:4
-            {SEW_8,SEW32}: begin            
-              vd_index[i] = inst_vd + uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+            {EEW8,EEW32}: begin            
+              case({inst_nf,emul_vd}) // EEW_vs2<=EEW_vd: same reorder tables as unit/constant stride
+                {NF2,EMUL4}: begin // offset = (k%2)*4 + k/2 for k=0..7
+                  case(uop_index_current[i][`UOP_INDEX_WIDTH-1:0])
+                    3'd1   : vd_offset[i] = 3'd4;
+                    3'd2   : vd_offset[i] = 3'd1;
+                    3'd3   : vd_offset[i] = 3'd5;
+                    3'd4   : vd_offset[i] = 3'd2;
+                    3'd5   : vd_offset[i] = 3'd6;
+                    3'd6   : vd_offset[i] = 3'd3;
+                    default: vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  endcase   
+                end
+                {NF2,EMUL2}: begin // offset = (k%2)*2 + k/2 for k=0..3
+                  case(uop_index_current[i][`UOP_INDEX_WIDTH-1:0])
+                    3'd1   : vd_offset[i] = 3'd2;
+                    3'd2   : vd_offset[i] = 3'd1;
+                    default: vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  endcase   
+                end
+                {NF3,EMUL2}: begin // offset = (k%3)*2 + k/3 for k=0..5
+                  case(uop_index_current[i][`UOP_INDEX_WIDTH-1:0])
+                    3'd1   : vd_offset[i] = 3'd2;
+                    3'd2   : vd_offset[i] = 3'd4;
+                    3'd3   : vd_offset[i] = 3'd1;
+                    3'd4   : vd_offset[i] = 3'd3;
+                    default: vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  endcase   
+                end
+                {NF4,EMUL2}: begin // offset = (k%4)*2 + k/4 for k=0..7
+                  case(uop_index_current[i][`UOP_INDEX_WIDTH-1:0])
+                    3'd1   : vd_offset[i] = 3'd2;
+                    3'd2   : vd_offset[i] = 3'd4;
+                    3'd3   : vd_offset[i] = 3'd6;
+                    3'd4   : vd_offset[i] = 3'd1;
+                    3'd5   : vd_offset[i] = 3'd3;
+                    3'd6   : vd_offset[i] = 3'd5;
+                    default: vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+                  endcase
+                end
+                default: // no reordering: offset follows the uop index
+                  vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
+              endcase
             end
             // 2:1
-            {SEW_16,SEW8},
-            {SEW_32,SEW16},
+            {EEW16,EEW8},
+            {EEW32,EEW16},
             // 4:1
-            {SEW_32,SEW8}: begin            
+            {EEW32,EEW8}: begin            
               case({emul_vs2,emul_vd})
-                {EMUL1,EMUL1}: begin
-                  vd_index[i] = inst_vd + uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
-                end
+                {EMUL1,EMUL1}: 
+                  vd_offset[i] = uop_index_current[i][`UOP_INDEX_WIDTH-1:0];
                 {EMUL2,EMUL1},
-                {EMUL4,EMUL2},
-                {EMUL8,EMUL4}: begin
-                  vd_index[i] = inst_vd + uop_index_current[i][`UOP_INDEX_WIDTH-1:1];
+                {EMUL8,EMUL4}: 
+                  vd_offset[i] = {1'b0, uop_index_current[i][`UOP_INDEX_WIDTH-1:1]}; // k>>1: two consecutive uops share one offset
+                {EMUL4,EMUL2}: begin // k>>1, with offsets 1 and 2 swapped when NF2
+                  if (inst_nf==NF2) begin
+                    case(uop_index_current[i][`UOP_INDEX_WIDTH-1:1])
+                      2'd1   : vd_offset[i] = 3'd2;
+                      2'd2   : vd_offset[i] = 3'd1;
+                      default: vd_offset[i] = {1'b0, uop_index_current[i][`UOP_INDEX_WIDTH-1:1]};
+                    endcase   
+                  end
+                  else
+                    vd_offset[i] = {1'b0, uop_index_current[i][`UOP_INDEX_WIDTH-1:1]};
                 end
                 {EMUL4,EMUL1},
-                {EMUL8,EMUL2}: begin
-                  vd_index[i] = inst_vd + uop_index_current[i][`UOP_INDEX_WIDTH-1:2];
-                end
+                {EMUL8,EMUL2}: 
+                  vd_offset[i] = {2'b0, uop_index_current[i][`UOP_INDEX_WIDTH-1:2]}; // k>>2: four consecutive uops share one offset
               endcase
             end
           endcase
@@ -3298,6 +3436,14 @@
     end
   end
 
+  // update vd_index and eew: vd_index = inst_vd + vd_offset zero-extended to REGFILE_INDEX_WIDTH bits
+  always_comb begin
+    for(int i=0;i<`NUM_DE_UOP;i++) begin: GET_VD
+      vd_index[i] = inst_vd + {{((`REGFILE_INDEX_WIDTH)-(`UOP_INDEX_WIDTH)){1'b0}}, vd_offset[i]}; // pad width derived from the macros instead of a literal 2'b0
+      vd_eew[i]   = eew_vd; // every uop of the instruction uses the same eew_vd
+    end
+  end
+
   // update vd_valid and vs3_valid
   // some uop need vd as the vs3 vector operand
   always_comb begin
@@ -3330,83 +3476,126 @@
     end
   end
 
-  // update vs2 index, eew and valid  
+  // update vs2 offset and valid: vs2_offset is the per-uop register offset later added to inst_vs2
   always_comb begin
     // initial
-    vs2_index = 'b0; 
-    vs2_eew   = EEW_NONE;
-    vs2_valid = 'b0; 
+    vs2_offset = 'b0;
+    vs2_valid  = 'b0;
     
-    for(int i=0;i<`NUM_DE_UOP;i++) begin: GET_VS2
+    for(int i=0;i<`NUM_DE_UOP;i++) begin: GET_VS2_OFFSET
       case(inst_funct6[2:0])
         UNORDERED_INDEX,
         ORDERED_INDEX: begin
-          case({inst_funct3,csr_sew})
+          case({eew_vs2,eew_vd})
             // EEW_vs2:EEW_vd=1:1
-            {SEW_8,SEW8},
-            {SEW_16,SEW16},
-            {SEW_32,SEW32},            
-            // 2:1
-            {SEW_16,SEW8},
-            {SEW_32,SEW16},            
-            // 4:1
-            {SEW_32,SEW8}: begin    
+            {EEW8,EEW8},
+            {EEW16,EEW16},
+            {EEW32,EEW32}: begin
               case(emul_vs2)
-                EMUL1: begin
-                  vs2_index[i] = inst_vs2;
-                  vs2_eew[i]   = eew_vs2; 
-                  vs2_valid[i] = 1'b1; 
-                end
                 EMUL2: begin
-                  vs2_index[i] = inst_vs2+uop_index_current[i][0];
-                  vs2_eew[i]   = eew_vs2; 
-                  vs2_valid[i] = 1'b1; 
+                  case(inst_nf)
+                    NF2: vs2_offset[i] = {2'b0, uop_index_current[i][1]}; // offset changes every 2 uops
+                    NF3: vs2_offset[i] = (uop_index_current[i]>='d3) ? 3'd1 : 3'b0; // 0 for uops 0..2, 1 from uop 3 on
+                    NF4: vs2_offset[i] = {2'b0, uop_index_current[i][2]}; // offset changes every 4 uops
+                    default: vs2_offset[i] = {2'b0, uop_index_current[i][0]}; // offset changes every uop
+                  endcase
+                  vs2_valid[i]  = 1'b1; 
                 end
                 EMUL4: begin
-                  vs2_index[i] = inst_vs2+uop_index_current[i][1:0];
-                  vs2_eew[i]   = eew_vs2; 
-                  vs2_valid[i] = 1'b1; 
+                  vs2_offset[i] = (inst_nf==NF2) ? {1'b0, uop_index_current[i][2:1]} : {1'b0, uop_index_current[i][1:0]}; // NF2: offset changes every 2 uops
+                  vs2_valid[i]  = 1'b1; 
                 end
                 EMUL8: begin
-                  vs2_index[i] = inst_vs2+uop_index_current[i][2:0];
-                  vs2_eew[i]   = eew_vs2; 
-                  vs2_valid[i] = 1'b1; 
+                  vs2_offset[i] = uop_index_current[i][2:0];
+                  vs2_valid[i]  = 1'b1; 
+                end
+                default: begin //EMUL1
+                  vs2_offset[i] = 'b0;
+                  vs2_valid[i]  = 1'b1; 
+                end
+              endcase
+            end           
+            // 2:1
+            {EEW16,EEW8},
+            {EEW32,EEW16}: begin
+              case(emul_vs2)
+                EMUL2: begin
+                  case(inst_nf)
+                    NF2: vs2_offset[i] = {1'b0, uop_index_current[i][2], uop_index_current[i][0]}; // offset built from uop index bits 2 and 0 (bit 1 is skipped)
+                    NF3,
+                    NF4: vs2_offset[i] = {2'b0, uop_index_current[i][0]};
+                    default: vs2_offset[i] = uop_index_current[i];  //NF1
+                  endcase
+                  vs2_valid[i]  = 1'b1; 
+                end
+                EMUL4: begin
+                  vs2_offset[i] = (inst_nf==NF2) ? {1'b0, uop_index_current[i][2], uop_index_current[i][0]} : uop_index_current[i]; // NF2: bits 2 and 0 of the uop index
+                  vs2_valid[i]  = 1'b1; 
+                end
+                EMUL8: begin
+                  vs2_offset[i] = uop_index_current[i];
+                  vs2_valid[i]  = 1'b1; 
+                end
+                default: begin //EMUL1
+                  vs2_offset[i] = 'b0;
+                  vs2_valid[i]  = 1'b1; 
+                end
+              endcase
+            end          
+            // 4:1
+            {EEW32,EEW8}: begin    
+              case(emul_vs2)
+                EMUL2: begin
+                  case(inst_nf)
+                    NF2: vs2_offset[i] = {1'b0, uop_index_current[i][2], uop_index_current[i][0]}; // offset built from uop index bits 2 and 0 (bit 1 is skipped)
+                    NF3,
+                    NF4: vs2_offset[i] = {2'b0, uop_index_current[i][0]};
+                    default: vs2_offset[i] = uop_index_current[i];  //NF1
+                  endcase
+                  vs2_valid[i]  = 1'b1; 
+                end
+                EMUL4: begin
+                  vs2_offset[i] = (inst_nf==NF2) ? {1'b0, uop_index_current[i][1:0]} : uop_index_current[i]; // NF2: offset wraps every 4 uops
+                  vs2_valid[i]  = 1'b1; 
+                end
+                EMUL8: begin
+                  vs2_offset[i] = uop_index_current[i];
+                  vs2_valid[i]  = 1'b1; 
+                end
+                default: begin //EMUL1
+                  vs2_offset[i] = 'b0;
+                  vs2_valid[i]  = 1'b1; 
                 end
               endcase
             end
             // 1:2
-            {SEW_8,SEW16},
-            {SEW_16,SEW32}: begin
+            {EEW8,EEW16},
+            {EEW16,EEW32}: begin
               case(emul_vs2)
                 EMUL1: begin
-                  vs2_index[i] = inst_vs2;
-                  vs2_eew[i]   = eew_vs2; 
-                  vs2_valid[i] = 1'b1; 
+                  vs2_offset[i] = 'b0;
+                  vs2_valid[i]  = 1'b1; 
                 end
                 EMUL2: begin
-                  vs2_index[i] = inst_vs2+uop_index_current[i][1];
-                  vs2_eew[i]   = eew_vs2; 
-                  vs2_valid[i] = 1'b1; 
+                  vs2_offset[i] = (inst_nf==NF2) ? {2'b0, uop_index_current[i][2]} : {2'b0, uop_index_current[i][1]}; // NF2: offset changes every 4 uops, otherwise every 2
+                  vs2_valid[i]  = 1'b1; 
                 end
                 EMUL4: begin
-                  vs2_index[i] = inst_vs2+uop_index_current[i][2:1];
-                  vs2_eew[i]   = eew_vs2; 
-                  vs2_valid[i] = 1'b1; 
+                  vs2_offset[i] = {1'b0, uop_index_current[i][2:1]};
+                  vs2_valid[i]  = 1'b1; 
                 end
               endcase
             end
             // 1:4
-            {SEW_8,SEW32}: begin     
+            {EEW8,EEW32}: begin     
               case(emul_vs2)
                 EMUL1: begin
-                  vs2_index[i] = inst_vs2;
-                  vs2_eew[i]   = eew_vs2; 
-                  vs2_valid[i] = 1'b1; 
+                  vs2_offset[i] = 'b0;
+                  vs2_valid[i]  = 1'b1; 
                 end
                 EMUL2: begin
-                  vs2_index[i] = inst_vs2+uop_index_current[i][2];
-                  vs2_eew[i]   = eew_vs2; 
-                  vs2_valid[i] = 1'b1; 
+                  vs2_offset[i] = {2'b0, uop_index_current[i][2]};
+                  vs2_valid[i]  = 1'b1; 
                 end
               endcase
             end
@@ -3416,6 +3605,14 @@
     end
   end
 
+  // update vs2 index and eew: vs2_index = inst_vs2 + vs2_offset zero-extended to REGFILE_INDEX_WIDTH bits
+  always_comb begin
+    for(int i=0;i<`NUM_DE_UOP;i++) begin: GET_VS2
+      vs2_index[i] = inst_vs2 + {{((`REGFILE_INDEX_WIDTH)-(`UOP_INDEX_WIDTH)){1'b0}}, vs2_offset[i]}; // pad width derived from the macros instead of a literal 2'b0
+      vs2_eew[i]   = eew_vs2; // assigned for every uop, independent of vs2_valid
+    end
+  end
+
   // update rd_index and valid
   always_comb begin
     for(int i=0;i<`NUM_DE_UOP;i++) begin: GET_RD
@@ -3450,24 +3647,39 @@
   // update segment_index valid
   always_comb begin
     for(int i=0;i<`NUM_DE_UOP;i++) begin: GET_SEG_INDEX
-      // initial 
-      seg_field_index[i] = 'b0;
+      // default: derived from the uop index according to NF; 0 when NF is not NF2/NF3/NF4
+      if (inst_nf==NF2)
+        seg_field_index[i] = {1'b0,uop_index_current[i][2:1]}; // changes every 2 uops
+      else if (inst_nf==NF3)
+        seg_field_index[i] = (uop_index_current[i]>=4'd3) ? 'd1 : 'b0; // 0 for uops 0..2, 1 from uop 3 on
+      else if (inst_nf==NF4)
+        seg_field_index[i] = {2'b0,uop_index_current[i][2]}; // changes every 4 uops
+      else
+        seg_field_index[i] = 'b0;
 
-      if(funct6_lsu.lsu_funct6.lsu_is_seg==IS_SEGMENT) begin
-        case(inst_nf)
-          NF2: begin
-            case(emul_max_vd_vs2)
-              EMUL2: seg_field_index[i] = {1'b0,uop_index_current[i][0]};
-              EMUL4: seg_field_index[i] = uop_index_current[i][1:0];
-            endcase
-          end
-          NF3,
-          NF4: begin
-            if (emul_max_vd_vs2==EMUL2)
-              seg_field_index[i] = {1'b0,uop_index_current[i][0]};
-          end
-        endcase
-      end
+      // override for index load/store when EEW_vs2>EEW_vd (2:1 and 4:1); applied regardless of inst_nf
+      case(inst_funct6[2:0])
+        UNORDERED_INDEX,
+        ORDERED_INDEX: begin
+          case({eew_vs2,eew_vd})
+            // 2:1
+            {EEW16,EEW8},
+            {EEW32,EEW16}: begin
+              case(emul_vs2) // emul_vs2 values not listed keep the default assigned above
+                EMUL2: seg_field_index[i] = {2'b0, uop_index_current[i][0]};
+                EMUL4: seg_field_index[i] = {1'b0, uop_index_current[i][2], uop_index_current[i][0]};
+              endcase
+            end
+            // 4:1
+            {EEW32,EEW8}: begin
+              case(emul_vs2) // emul_vs2 values not listed keep the default assigned above
+                EMUL2: seg_field_index[i] = {2'b0, uop_index_current[i][0]};
+                EMUL4: seg_field_index[i] = {1'b0, uop_index_current[i][1:0]};
+              endcase
+            end
+          endcase
+        end
+      endcase
     end
   end
 
diff --git a/hdl/verilog/rvv/sve/rvv_backend_tb/src/lsu_agent_lsu_driver.sv b/hdl/verilog/rvv/sve/rvv_backend_tb/src/lsu_agent_lsu_driver.sv
index 0e12dd4..0e50dfa 100644
--- a/hdl/verilog/rvv/sve/rvv_backend_tb/src/lsu_agent_lsu_driver.sv
+++ b/hdl/verilog/rvv/sve/rvv_backend_tb/src/lsu_agent_lsu_driver.sv
@@ -8,6 +8,7 @@
 
 class lsu_driver extends uvm_driver # (lsu_transaction);
 
+  parameter int MAX_SEG = 8;
   typedef virtual lsu_interface v_if; 
   v_if lsu_if;
   
@@ -42,6 +43,8 @@
   // receive & decode inst from rvs
   extern function void write_lsu_inst(rvs_transaction inst_tr);
   extern function int lsu_uop_decode(ref rvs_transaction inst_tr);
+  extern protected function void lsu_uop_gen_delay(ref lsu_transaction uop_tr);
+  extern protected function void lsu_uop_gen_trap(ref lsu_transaction uop_tr);
 
 endclass: lsu_driver
 
@@ -153,17 +156,17 @@
             // update address for indexed-stride from vidx_data
             if(uop_tr.is_indexed == 1) begin
               if(lsu_if.uop_lsu_rvv2lsu[i].vidx_valid !== 1) begin
-                `uvm_fatal("LSU_DRV", "Uop is indexed but vidx_valid is not")
+                `uvm_error("LSU_DRV", "Uop is indexed but vidx_valid is not")
                 continue;
               end else if(uop_tr.lsu_slot_addr_valid === 1) begin
                 `uvm_fatal("TB_ISSUE", "Decode error")
                 continue;
               end else if(uop_tr.vidx_vreg_idx !== lsu_if.uop_lsu_rvv2lsu[i].vidx_addr) begin
-                `uvm_fatal("LSU_DRV", $sformatf("vidx_addr mismatch: lsu=%0d, dut=%0d", uop_tr.vidx_vreg_idx, lsu_if.uop_lsu_rvv2lsu[i].vidx_addr))
+                `uvm_error("LSU_DRV", $sformatf("vidx_addr mismatch: lsu=%0d, dut=%0d", uop_tr.vidx_vreg_idx, lsu_if.uop_lsu_rvv2lsu[i].vidx_addr))
                 continue;
               end else begin
                 `uvm_info("LSU_DRV", $sformatf("Got vreg[%0d]=0x%16x from dut.", lsu_if.uop_lsu_rvv2lsu[i].vidx_addr, lsu_if.uop_lsu_rvv2lsu[i].vidx_data), UVM_HIGH);
-                for(int byte_idx=uop_tr.vidx_vreg_byte_start; byte_idx<=uop_tr.vidx_vreg_byte_end; byte_idx += uop_tr.vidx_vreg_eew/8) begin
+                for(int byte_idx=uop_tr.vidx_vreg_byte_head; byte_idx<uop_tr.vidx_vreg_byte_tail; byte_idx += uop_tr.vidx_vreg_eew/8) begin
                   case(uop_tr.vidx_vreg_eew)
                     // For indexed-stride, the stride from vrf should be zero-extended to `XLEN.
                     EEW8 : stride_temp = $unsigned(lsu_if.uop_lsu_rvv2lsu[i].vidx_data[byte_idx*8 +: 8 ]);
@@ -173,7 +176,7 @@
                   indexed_stride.push_back(stride_temp);
                   `uvm_info("LSU_DRV", $sformatf("byte[%0d]: push stride=0x%8x to indexed_stride(size: %0d).", byte_idx, stride_temp, indexed_stride.size()), UVM_HIGH)
                 end
-                for(int byte_idx=uop_tr.data_vreg_byte_start; byte_idx<=uop_tr.data_vreg_byte_end; byte_idx++) begin
+                for(int byte_idx=uop_tr.data_vreg_byte_head; byte_idx<uop_tr.data_vreg_byte_tail; byte_idx++) begin
                   if(byte_idx % (uop_tr.data_vreg_eew/8) == 0) begin 
                     stride_temp = indexed_stride.pop_front();
                     `uvm_info("LSU_DRV", $sformatf("byte[%0d]: pop stride=0x%8x from indexed_stride(size: %0d).", byte_idx, stride_temp, indexed_stride.size()), UVM_HIGH)
@@ -198,7 +201,7 @@
                 `uvm_fatal("TB_ISSUE", $sformatf("vregfile_read_addr mismatch: lsu=%0d, dut=%0d", uop_tr.data_vreg_idx, lsu_if.uop_lsu_rvv2lsu[i].vregfile_read_addr))
                 continue;
               end else begin
-                for(int byte_idx=uop_tr.data_vreg_byte_start; byte_idx<=uop_tr.data_vreg_byte_end; byte_idx++) begin
+                for(int byte_idx=uop_tr.data_vreg_byte_head; byte_idx<uop_tr.data_vreg_byte_tail; byte_idx++) begin
                   uop_tr.lsu_slot_data[byte_idx] = lsu_if.uop_lsu_rvv2lsu[i].vregfile_read_data[byte_idx*8 +: 8];
                 end
                 uop_tr.lsu_slot_data_valid = 1;
@@ -211,6 +214,7 @@
                 `uvm_fatal("LSU_DRV", "Uops need v0_data but v0_valid is 0")
                 continue;
               end else begin
+                `uvm_info("LSU_DRV", $sformatf("uop_pc:0x%8x, v0_data=0x%016x", uop_tr.uop_pc, lsu_if.uop_lsu_rvv2lsu[i].v0_data), UVM_HIGH)
                 uop_tr.lsu_slot_strobe = lsu_if.uop_lsu_rvv2lsu[i].v0_data;
                 uop_tr.lsu_slot_addr_valid = 1;
               end
@@ -286,8 +290,7 @@
                 `uvm_fatal("TB_ISSUE", "LSU decode err.")
                 break;
               end else if(uops_tx_queue[uop_idx].uop_done == 0) begin
-                // for(int byte_idx=0; byte_idx<`VLENB; byte_idx++) begin
-                for(int byte_idx=uops_tx_queue[uop_idx].data_vreg_byte_start; byte_idx<=uops_tx_queue[uop_idx].data_vreg_byte_end; byte_idx++) begin
+                for(int byte_idx=uops_tx_queue[uop_idx].data_vreg_byte_head; byte_idx<uops_tx_queue[uop_idx].data_vreg_byte_tail; byte_idx++) begin
                   if(uops_tx_queue[uop_idx].lsu_slot_strobe[byte_idx] === 1'b1) begin
                     mem.pc = uops_tx_queue[uop_idx].uop_pc;
                     mem.load_byte(data_temp, uops_tx_queue[uop_idx].lsu_slot_addr[byte_idx]);
@@ -308,8 +311,7 @@
                 `uvm_fatal("TB_ISSUE", "LSU decode err.")
                 break;
               end else if(uops_tx_queue[uop_idx].uop_done == 0) begin
-                // for(int byte_idx=0; byte_idx<`VLENB; byte_idx++) begin
-                for(int byte_idx=uops_tx_queue[uop_idx].data_vreg_byte_start; byte_idx<=uops_tx_queue[uop_idx].data_vreg_byte_end; byte_idx++) begin
+                for(int byte_idx=uops_tx_queue[uop_idx].data_vreg_byte_head; byte_idx<uops_tx_queue[uop_idx].data_vreg_byte_tail; byte_idx++) begin
                   if(uops_tx_queue[uop_idx].lsu_slot_strobe[byte_idx] === 1'b1) begin
                     data_temp = uops_tx_queue[uop_idx].lsu_slot_data[byte_idx];
                     mem.pc = uops_tx_queue[uop_idx].uop_pc;
@@ -432,17 +434,14 @@
 endfunction
 
 function int lsu_driver::lsu_uop_decode(ref rvs_transaction inst_tr);
-  lsu_transaction uop_tr;
+  lsu_transaction uop_tr; // segment max is 3
   // vtype decode
-  int sew;
-  int lsu_eew;
-  real lmul;
   int elm_idx_max;
   int lsu_nf;
-  int seg_idx_max;
+  int seg_num;
   int evl;
   int vstart;
-  int uop_vstart;
+  int uop_vstart[MAX_SEG];
 
   int  data_eew;
   real data_emul;
@@ -452,15 +451,19 @@
   real emul_max;
 
   // uop info
-  int uops_cnt;
   int uops_num;
-  int data_byte_idx;
-  int vidx_byte_idx;
-  int temp_idx;
   int data_vreg_idx_base;
   int vidx_vreg_idx_base;
   int data_vreg_idx_last;
   int vidx_vreg_idx_last;
+  int elm_per_uop;
+  int elm_idx_head[MAX_SEG]; // elm pointer
+  int elm_idx_tail[MAX_SEG]; // elm pointer
+  int data_byte_idx[MAX_SEG];
+  int vidx_byte_idx[MAX_SEG];
+  int seg_idx;
+  int switch_seg;
+  int seg_switch_gap; // num of bytes per data vreg
 
   // load/store addres info
   int addr;
@@ -468,87 +471,66 @@
 
 // Decode ---------------------------------------------------------------------- 
   `uvm_info("LSU_DRV","Start decode vtype",UVM_HIGH)
-  sew = 8 << inst_tr.vsew;
-  lsu_eew = inst_tr.lsu_eew;
-  lmul = 2.0 ** $signed(inst_tr.vlmul);
 
-  addr_base = inst_tr.rs1_data;
-  lsu_nf = inst_tr.lsu_nf;
-  vstart = inst_tr.vstart;
+  addr_base   = inst_tr.rs1_data;
+  evl         = inst_tr.evl;
+  vstart      = inst_tr.vstart;
+  lsu_nf      = inst_tr.lsu_nf;
 
+  eew_max     = inst_tr.eew_max;
+  emul_max    = inst_tr.emul_max;
+
+  elm_idx_max = inst_tr.elm_idx_max;
+  seg_num     = inst_tr.seg_num;
+
+  uops_num    = int'($ceil(emul_max)) * (seg_num);
+  elm_per_uop = `VLEN / eew_max;
+
+  case(inst_tr.inst_type)
+    LD: begin
+      data_eew  = inst_tr.dest_eew;
+      vidx_eew  = inst_tr.src2_eew;
+      data_emul = inst_tr.dest_emul;
+      vidx_emul = inst_tr.src2_emul;
+    end
+    ST: begin
+      data_eew  = inst_tr.src3_eew;
+      vidx_eew  = inst_tr.src2_eew;
+      data_emul = inst_tr.src3_emul;
+      vidx_emul = inst_tr.src2_emul;
+    end
+  endcase
   case(inst_tr.lsu_mop) 
     LSU_US   : begin
       case(inst_tr.lsu_umop)
         MASK: begin
-          data_eew  = EEW8;
-          data_emul = EMUL1;
-          vidx_eew  = EEW32;
-          vidx_emul = EMUL1;
-          eew_max   = EEW8;
-          emul_max  = EMUL1;
           const_stride = (lsu_nf+1) * data_eew/8;
-          seg_idx_max  = lsu_nf + 1;
-          evl = int'($ceil(inst_tr.vl / 8.0));
         end
         WHOLE_REG: begin
-          data_eew  = lsu_eew;
-          data_emul = lsu_nf + 1;
-          vidx_eew  = EEW32;
-          vidx_emul = EMUL1;
-          eew_max   = lsu_eew;
-          emul_max  = data_emul;
           const_stride = data_eew/8;
-          seg_idx_max  = 1;
-          evl = data_emul * `VLEN / data_eew;
         end
         default: begin
-          data_eew = lsu_eew;
-          data_emul = data_eew * lmul / sew;
-          vidx_eew = EEW32;
-          vidx_emul = EMUL1;
-          eew_max  = lsu_eew;
-          emul_max = eew_max * lmul / sew;
           const_stride = (lsu_nf+1) * data_eew/8;
-          seg_idx_max  = lsu_nf + 1;
-          evl = inst_tr.vl;
         end
       endcase
     end
     LSU_CS  : begin
-      data_eew = lsu_eew;
-      data_emul = data_eew * lmul / sew;
-      vidx_eew = EEW32;
-      vidx_emul = EMUL1;
-      eew_max  = lsu_eew;
-      emul_max = eew_max * lmul / sew;
       const_stride = inst_tr.rs2_data;
-      seg_idx_max  = lsu_nf + 1;
-      evl = inst_tr.vl;
     end
     LSU_UI, 
     LSU_OI: begin
-      data_eew = sew;
-      data_emul = data_eew * lmul / sew;
-      vidx_eew = lsu_eew;
-      vidx_emul = vidx_eew * lmul / sew;
-      eew_max  = (data_eew > vidx_eew) ? data_eew : vidx_eew;
-      emul_max = eew_max * lmul / sew;
       const_stride = 0;
-      seg_idx_max  = lsu_nf + 1;
-      evl = inst_tr.vl;
     end      
   endcase
-  uops_num = int'($ceil(emul_max)) * (seg_idx_max);
-  elm_idx_max = int'($ceil(emul_max)) * `VLEN / eew_max;
   
   if(inst_tr.inst_type == LD) begin
     data_vreg_idx_base = inst_tr.dest_idx;
-    data_vreg_idx_last = inst_tr.dest_idx + (seg_idx_max) * int'($ceil(data_emul)) - 1;
+    data_vreg_idx_last = inst_tr.dest_idx + (seg_num) * int'($ceil(data_emul)) - 1;
     vidx_vreg_idx_base = inst_tr.src2_idx;
     vidx_vreg_idx_last = inst_tr.src2_idx + int'($ceil(vidx_emul)) - 1;
   end else if(inst_tr.inst_type == ST) begin
     data_vreg_idx_base = inst_tr.src3_idx;
-    data_vreg_idx_last = inst_tr.src3_idx + (seg_idx_max) * int'($ceil(data_emul)) - 1;
+    data_vreg_idx_last = inst_tr.src3_idx + (seg_num) * int'($ceil(data_emul)) - 1;
     vidx_vreg_idx_base = inst_tr.src2_idx;
     vidx_vreg_idx_last = inst_tr.src2_idx + int'($ceil(vidx_emul)) - 1;
   end else begin
@@ -557,164 +539,198 @@
   `uvm_info("LSU_DRV", $sformatf("eew_max=%0d, emul_max=%.2f, elm_idx_max=%0d", eew_max, emul_max, elm_idx_max), UVM_HIGH)
 
 // Uops Gen --------------------------------------------------------------------
-  `uvm_info("LSU_DRV","Start gen uops",UVM_HIGH)
-  uops_cnt = 0;
-  for(int seg_idx=0; seg_idx<seg_idx_max; seg_idx++) begin
-    uop_vstart = inst_tr.vstart;
-    data_byte_idx = vstart * data_eew / 8;
-    vidx_byte_idx = vstart * vidx_eew / 8;
-    for(int elm_idx=0; elm_idx<elm_idx_max; elm_idx++) begin
-      
-      `uvm_info("LSU_DRV",$sformatf("seg_idx=%0d, elm_idx=%0d", seg_idx, elm_idx),UVM_HIGH)
-      if(elm_idx * eew_max % `VLEN == 0) begin
-        `uvm_info("LSU_DRV","Gen new uop",UVM_HIGH)
-        uop_tr = new();
-        // Gen delay
-        case(delay_mode_rvv2lsu)
-          delay_mode_pkg::SLOW: begin
-            uop_tr.c_rvv2lsu_delay.constraint_mode(0);
-            assert(uop_tr.randomize(rvv2lsu_delay) with {
-              rvv2lsu_delay dist {
-                [1:50] :/ 20,
-                [50:100] :/ 80
-              };
-            });
-          end
-          delay_mode_pkg::NORMAL: begin
-            assert(uop_tr.randomize(rvv2lsu_delay) with {
-              rvv2lsu_delay dist {
-                [0:10] :/ 50,
-                [10:20] :/ 30,
-                [20:50] :/ 20
-              };
-            });
-          end
-          delay_mode_pkg::FAST: begin
-            assert(uop_tr.randomize(rvv2lsu_delay) with {
-              rvv2lsu_delay dist {
-                0      := 80,
-                [1:5]  :/ 15,
-                [5:20] :/ 5
-              };
-            });
-          end
-        endcase
-        case(delay_mode_lsu2rvv)
-          delay_mode_pkg::SLOW: begin
-            uop_tr.c_lsu2rvv_delay.constraint_mode(0);
-            assert(uop_tr.randomize(lsu2rvv_delay) with {
-              lsu2rvv_delay dist {
-                [1:50] :/ 20,
-                [50:100] :/ 80
-              };
-            });
-          end
-          delay_mode_pkg::NORMAL: begin
-            assert(uop_tr.randomize(lsu2rvv_delay) with {
-              lsu2rvv_delay dist {
-                [0:10] :/ 50,
-                [10:20] :/ 30,
-                [20:50] :/ 20
-              };
-            });
-          end
-          delay_mode_pkg::FAST: begin
-            assert(uop_tr.randomize(lsu2rvv_delay) with {
-              lsu2rvv_delay dist {
-                0      := 80,
-                [1:5]  :/ 15,
-                [5:20] :/ 5
-              };
-            });
-          end
-        endcase
-        // Gen trap
-        if(trap_en) begin
-          if(always_trap) begin
-            assert(uop_tr.randomize(trap_occured) with {
-              trap_occured == 1;
-            });
-          end else begin
-            assert(uop_tr.randomize(trap_occured) with {
-              trap_occured dist {
-                // 0 := 99,
-                0 := 9,
-                1 := 1
-              };
-            });
-          end
-        end else begin
-          assert(uop_tr.randomize(trap_occured) with {
-            trap_occured == 0;
-          });
-        end
-        uops_cnt++;
-        uop_tr.inst_string = inst_tr.asm_string;
-        if(inst_tr.inst_type == LD) begin
-          uop_tr.kind = lsu_transaction::LOAD;
-        end else if(inst_tr.inst_type == ST) begin
-          uop_tr.kind = lsu_transaction::STORE;
-        end else begin
-          `uvm_fatal("TB_ISSUE", "Decode inst_tr which is not load/store in lsu_driver.")
-        end
-        uop_tr.uop_pc = inst_tr.pc;
-        uop_tr.uop_index = uops_cnt-1;
+  `uvm_info("LSU_DRV","Start gen uops",UVM_HIGH)  
+  if(data_emul < 1) begin
+    seg_switch_gap = data_emul * `VLENB;
+  end else begin
+    seg_switch_gap = `VLENB;
+  end
 
-        uop_tr.is_last_uop = (uops_cnt == uops_num) ? 1: 0;
-        uop_tr.is_indexed = (inst_tr.lsu_mop inside {LSU_UI, LSU_OI}) ? 1 : 0;
-        uop_tr.total_uops_num = uops_num;
-        uop_tr.base_addr = addr_base;
-        uop_tr.vstart    = uop_vstart;
+  for(int seg_idx=0; seg_idx<seg_num; seg_idx++) begin
+    elm_idx_head[seg_idx]  = 0;
+    elm_idx_tail[seg_idx]  = elm_per_uop;
+    data_byte_idx[seg_idx] = 0;
+    vidx_byte_idx[seg_idx] = 0;
+    uop_vstart[seg_idx]    = vstart;
+  end
+  seg_idx = 0;
+  for(int uops_idx=0; uops_idx<uops_num; uops_idx++) begin
+    `uvm_info("LSU_DRV","Gen new uop",UVM_HIGH)
+    uop_tr = new();
+    `uvm_info("LSU_DRV", $sformatf("seg_idx           = %0d\n", seg_idx      ), UVM_HIGH)
+    `uvm_info("LSU_DRV", $sformatf("seg_switch_gap    = %0d\n", seg_switch_gap), UVM_HIGH)
+    `uvm_info("LSU_DRV", $sformatf("elm_idx_head[%0d] = %0d\n", seg_idx, elm_idx_head[seg_idx]), UVM_HIGH)
+    `uvm_info("LSU_DRV", $sformatf("elm_idx_tail[%0d] = %0d\n", seg_idx, elm_idx_tail[seg_idx]), UVM_HIGH)
 
-        uop_tr.vm = inst_tr.vm;
-        uop_tr.lsu_slot_strobe = '0;
-        
-        uop_tr.data_vreg_valid      = 1;
-        uop_tr.data_vreg_idx        = data_vreg_idx_base + elm_idx * (data_eew/8) / `VLENB + seg_idx * int'($ceil(data_emul));
-        uop_tr.data_vreg_eew        = data_eew; 
-        uop_tr.data_vreg_byte_start = data_byte_idx % `VLENB;
+    lsu_uop_gen_delay(uop_tr);
+    lsu_uop_gen_trap(uop_tr);
 
-        uop_tr.vidx_vreg_valid      = (inst_tr.lsu_mop inside {LSU_UI, LSU_OI}) ? 1 : 0;
-        uop_tr.vidx_vreg_idx        = vidx_vreg_idx_base + elm_idx * (vidx_eew/8) / `VLENB; 
-        uop_tr.vidx_vreg_eew        = vidx_eew; 
-        uop_tr.vidx_vreg_byte_start = vidx_byte_idx % `VLENB;
+    uop_tr.inst_string = inst_tr.asm_string;
+    if(inst_tr.inst_type == LD) begin
+      uop_tr.kind = lsu_transaction::LOAD;
+    end else if(inst_tr.inst_type == ST) begin
+      uop_tr.kind = lsu_transaction::STORE;
+    end else begin
+      `uvm_fatal("TB_ISSUE", "Decode inst_tr which is not load/store in lsu_driver.")
+    end
+    uop_tr.uop_pc               = inst_tr.pc;
+    uop_tr.uop_index            = uops_idx;
+
+    uop_tr.is_last_uop          = (uops_idx == uops_num-1) ? 1: 0;
+    uop_tr.is_indexed           = (inst_tr.lsu_mop inside {LSU_UI, LSU_OI}) ? 1 : 0;
+    uop_tr.total_uops_num       = uops_num;
+    uop_tr.base_addr            = addr_base;
+    uop_tr.vstart               = uop_vstart[seg_idx];
+
+    uop_tr.vm                   = inst_tr.vm;
+    uop_tr.lsu_slot_strobe      = '0;
+    
+    uop_tr.data_vreg_valid      = 1;
+    uop_tr.data_vreg_idx        = data_vreg_idx_base + elm_idx_head[seg_idx] * (data_eew/8) / `VLENB + seg_idx * int'($ceil(data_emul));
+    uop_tr.data_vreg_eew        = data_eew; 
+    uop_tr.data_vreg_byte_head  = data_byte_idx[seg_idx];
+
+    uop_tr.vidx_vreg_valid      = (inst_tr.lsu_mop inside {LSU_UI, LSU_OI}) ? 1 : 0;
+    uop_tr.vidx_vreg_idx        = vidx_vreg_idx_base + elm_idx_head[seg_idx] * (vidx_eew/8) / `VLENB; 
+    uop_tr.vidx_vreg_eew        = vidx_eew; 
+    uop_tr.vidx_vreg_byte_head  = vidx_byte_idx[seg_idx];
+
+    for(int elm_idx=elm_idx_head[seg_idx]; elm_idx<elm_idx_tail[seg_idx]; elm_idx++) begin
+      if(elm_idx == vstart) begin
+        uop_tr.data_vreg_byte_head = vstart * data_eew / 8 %`VLENB;
+        uop_tr.vidx_vreg_byte_head = vstart * vidx_eew / 8 %`VLENB;
       end
-
       if(elm_idx >= vstart && elm_idx < evl) begin
         for(int byte_idx=0; byte_idx<data_eew/8; byte_idx++) begin
           addr = addr_base + const_stride * elm_idx + data_eew / 8 * seg_idx + byte_idx;
-          temp_idx = data_byte_idx % `VLENB;
-          uop_tr.lsu_slot_addr[temp_idx] = addr;
-          uop_tr.lsu_slot_strobe[temp_idx] = 1'b1;
-          data_byte_idx++;
-          // `uvm_info("LSU_DRV",$sformatf("addr=%0x, data_byte_idx = %0d, temp_idx=%0d", addr, data_byte_idx, temp_idx),UVM_HIGH)
-          // uop_tr.print();
+          uop_tr.lsu_slot_addr[data_byte_idx[seg_idx]] = addr;
+          uop_tr.lsu_slot_strobe[data_byte_idx[seg_idx]] = 1'b1;
+          data_byte_idx[seg_idx]++;
         end
-        vidx_byte_idx += vidx_eew/8;
+        vidx_byte_idx[seg_idx] += vidx_eew/8;
+      end else begin
+        data_byte_idx[seg_idx] += data_eew/8;
+        vidx_byte_idx[seg_idx] += vidx_eew/8;
       end
       if(elm_idx >= vstart) begin
-        uop_vstart++;
+        uop_vstart[seg_idx]++;
       end
+    end // elm-loop
 
-      if(elm_idx * eew_max % `VLEN == `VLEN - eew_max) begin
-        if(elm_idx >= vstart && elm_idx < evl) begin
-          uop_tr.data_vreg_byte_end = (data_byte_idx-1) % `VLENB;
-          uop_tr.vidx_vreg_byte_end = (vidx_byte_idx-1) % `VLENB;
-        end else begin
-          uop_tr.data_vreg_byte_end = (data_byte_idx) % `VLENB;
-          uop_tr.vidx_vreg_byte_end = (vidx_byte_idx) % `VLENB;
-        end
-        if(inst_tr.lsu_mop inside {LSU_US, LSU_CS} && inst_tr.vm == 1) begin
-          uop_tr.lsu_slot_addr_valid = 1;
-        end
-        `uvm_info("LSU_DRV",$sformatf("Decode uop_tr to uops_rx_queque:\n%s",uop_tr.sprint()),UVM_HIGH)
-        uops_rx_queue.push_back(uop_tr);
+    uop_tr.data_vreg_byte_tail = data_byte_idx[seg_idx];
+    uop_tr.vidx_vreg_byte_tail = vidx_byte_idx[seg_idx];
+
+    if(inst_tr.lsu_mop inside {LSU_US, LSU_CS} && inst_tr.vm == 1) begin
+      uop_tr.lsu_slot_addr_valid = 1;
+    end
+    `uvm_info("LSU_DRV",$sformatf("Decode uop_tr to uops_rx_queque:\n%s",uop_tr.sprint()),UVM_HIGH)
+    uops_rx_queue.push_back(uop_tr);
+
+    `uvm_info("LSU_DRV", $sformatf("data_byte_idx[%0d] = %0d\n", seg_idx, data_byte_idx[seg_idx]), UVM_HIGH)
+
+    switch_seg = data_byte_idx[seg_idx] >= seg_switch_gap;
+    
+    data_byte_idx[seg_idx] = (data_byte_idx[seg_idx] % `VLENB);
+    vidx_byte_idx[seg_idx] = (vidx_byte_idx[seg_idx] % `VLENB);
+
+    elm_idx_head[seg_idx] += elm_per_uop;
+    elm_idx_tail[seg_idx] += elm_per_uop;
+
+    if(switch_seg) begin
+      if(seg_idx == seg_num-1) begin
+        seg_idx = 0;
+      end else begin
+        seg_idx += 1;
       end
     end
-  end
+  end // uop-loop
   `uvm_info("LSU_DRV","Decode done",UVM_HIGH)
 endfunction: lsu_uop_decode
 
+function void lsu_driver::lsu_uop_gen_delay(ref lsu_transaction uop_tr);        // Gen delay: randomize rvv2lsu_delay and lsu2rvv_delay of uop_tr per the configured delay modes
+  case(delay_mode_rvv2lsu) // randomize() is checked with if(), not assert(), so it still executes when assertions are disabled
+    delay_mode_pkg::SLOW: begin
+      uop_tr.c_rvv2lsu_delay.constraint_mode(0); // presumably the default constraint excludes the SLOW range - confirm
+      if(!uop_tr.randomize(rvv2lsu_delay) with {
+        rvv2lsu_delay dist {
+          [1:50] :/ 20,
+          [50:100] :/ 80
+        };
+      }) `uvm_error("LSU_DRV", "Failed to randomize rvv2lsu_delay in SLOW mode.")
+    end
+    delay_mode_pkg::NORMAL: begin
+      if(!uop_tr.randomize(rvv2lsu_delay) with {
+        rvv2lsu_delay dist {
+          [0:10] :/ 50,
+          [10:20] :/ 30,
+          [20:50] :/ 20
+        };
+      }) `uvm_error("LSU_DRV", "Failed to randomize rvv2lsu_delay in NORMAL mode.")
+    end
+    delay_mode_pkg::FAST: begin
+      if(!uop_tr.randomize(rvv2lsu_delay) with {
+        rvv2lsu_delay dist {
+          0      := 80,
+          [1:5]  :/ 15,
+          [5:20] :/ 5
+        };
+      }) `uvm_error("LSU_DRV", "Failed to randomize rvv2lsu_delay in FAST mode.")
+    end
+  endcase
+  case(delay_mode_lsu2rvv) // same weighting scheme as above, applied to the lsu2rvv_delay field
+    delay_mode_pkg::SLOW: begin
+      uop_tr.c_lsu2rvv_delay.constraint_mode(0); // presumably the default constraint excludes the SLOW range - confirm
+      if(!uop_tr.randomize(lsu2rvv_delay) with {
+        lsu2rvv_delay dist {
+          [1:50] :/ 20,
+          [50:100] :/ 80
+        };
+      }) `uvm_error("LSU_DRV", "Failed to randomize lsu2rvv_delay in SLOW mode.")
+    end
+    delay_mode_pkg::NORMAL: begin
+      if(!uop_tr.randomize(lsu2rvv_delay) with {
+        lsu2rvv_delay dist {
+          [0:10] :/ 50,
+          [10:20] :/ 30,
+          [20:50] :/ 20
+        };
+      }) `uvm_error("LSU_DRV", "Failed to randomize lsu2rvv_delay in NORMAL mode.")
+    end
+    delay_mode_pkg::FAST: begin
+      if(!uop_tr.randomize(lsu2rvv_delay) with {
+        lsu2rvv_delay dist {
+          0      := 80,
+          [1:5]  :/ 15,
+          [5:20] :/ 5
+        };
+      }) `uvm_error("LSU_DRV", "Failed to randomize lsu2rvv_delay in FAST mode.")
+    end
+  endcase
+endfunction: lsu_uop_gen_delay
+
+function void lsu_driver::lsu_uop_gen_trap(ref lsu_transaction uop_tr);
+  // Gen trap: randomize uop_tr.trap_occured -> 0 if !trap_en, 1 if always_trap, otherwise 0/1 weighted 9:1
+  if(trap_en) begin
+    if(always_trap) begin
+      if(!uop_tr.randomize(trap_occured) with { // if(), not assert(): must still run when assertions are disabled
+        trap_occured == 1;
+      }) `uvm_error("LSU_DRV", "Failed to randomize trap_occured (always_trap).")
+    end else begin
+      if(!uop_tr.randomize(trap_occured) with {
+        trap_occured dist {
+          // 0 := 99,
+          0 := 9,
+          1 := 1
+        };
+      }) `uvm_error("LSU_DRV", "Failed to randomize trap_occured (random trap).")
+    end
+  end else begin
+    if(!uop_tr.randomize(trap_occured) with {
+      trap_occured == 0;
+    }) `uvm_error("LSU_DRV", "Failed to randomize trap_occured (trap disabled).")
+  end
+endfunction: lsu_uop_gen_trap
+
 function void lsu_driver::final_phase(uvm_phase phase);
   super.final_phase(phase);
   if(inst_queue.size()>0) begin
diff --git a/hdl/verilog/rvv/sve/rvv_backend_tb/src/lsu_agent_lsu_transaction.sv b/hdl/verilog/rvv/sve/rvv_backend_tb/src/lsu_agent_lsu_transaction.sv
index 783c12f..5e44515 100644
--- a/hdl/verilog/rvv/sve/rvv_backend_tb/src/lsu_agent_lsu_transaction.sv
+++ b/hdl/verilog/rvv/sve/rvv_backend_tb/src/lsu_agent_lsu_transaction.sv
@@ -25,15 +25,15 @@
   bit   data_vreg_valid;
   int   data_vreg_idx;
   eew_e data_vreg_eew;
-  int   data_vreg_byte_start;
-  int   data_vreg_byte_end;
+  int   data_vreg_byte_head;
+  int   data_vreg_byte_tail;
 
   // vs2
   bit   vidx_vreg_valid;
   int   vidx_vreg_idx;
   eew_e vidx_vreg_eew;
-  int   vidx_vreg_byte_start;
-  int   vidx_vreg_byte_end;
+  int   vidx_vreg_byte_head;
+  int   vidx_vreg_byte_tail;
 
   /* info about load/store address/data */
   bit  lsu_slot_addr_valid;
@@ -82,15 +82,15 @@
 
     `uvm_field_int(data_vreg_idx,UVM_ALL_ON)
     `uvm_field_enum(eew_e,data_vreg_eew,UVM_ALL_ON)
-    `uvm_field_int(data_vreg_byte_start,UVM_ALL_ON)
-    `uvm_field_int(data_vreg_byte_end  ,UVM_ALL_ON)
+    `uvm_field_int(data_vreg_byte_head,UVM_ALL_ON)
+    `uvm_field_int(data_vreg_byte_tail,UVM_ALL_ON)
     `uvm_field_int(data_vreg_valid,UVM_ALL_ON)
 
     if(is_indexed) begin
       `uvm_field_int(vidx_vreg_idx  ,UVM_ALL_ON)
       `uvm_field_enum(eew_e, vidx_vreg_eew,UVM_ALL_ON)
-      `uvm_field_int(vidx_vreg_byte_start,UVM_ALL_ON)
-      `uvm_field_int(vidx_vreg_byte_end  ,UVM_ALL_ON)
+      `uvm_field_int(vidx_vreg_byte_head,UVM_ALL_ON)
+      `uvm_field_int(vidx_vreg_byte_tail,UVM_ALL_ON)
       `uvm_field_int(vidx_vreg_valid,UVM_ALL_ON)
     end 
     `uvm_field_int(lsu_slot_addr_valid, UVM_ALL_ON)
@@ -125,14 +125,14 @@
   data_vreg_valid      = 0;
   data_vreg_idx        = 0;
   data_vreg_eew        = EEW_NONE;
-  data_vreg_byte_start = 0;
-  data_vreg_byte_end   = 0;
+  data_vreg_byte_head  = 0;
+  data_vreg_byte_tail  = 0;
 
   vidx_vreg_valid      = 0;
   vidx_vreg_idx        = 0;
   vidx_vreg_eew        = EEW_NONE;
-  vidx_vreg_byte_start = 0;
-  vidx_vreg_byte_end   = 0;
+  vidx_vreg_byte_head  = 0;
+  vidx_vreg_byte_tail  = 0;
 
 
   lsu_slot_addr_valid = 1'b0;
diff --git a/hdl/verilog/rvv/sve/rvv_backend_tb/src/rvs_agent_rvs_monitor.sv b/hdl/verilog/rvv/sve/rvv_backend_tb/src/rvs_agent_rvs_monitor.sv
index 71627c3..91c006e 100644
--- a/hdl/verilog/rvv/sve/rvv_backend_tb/src/rvs_agent_rvs_monitor.sv
+++ b/hdl/verilog/rvv/sve/rvv_backend_tb/src/rvs_agent_rvs_monitor.sv
@@ -245,23 +245,32 @@
 
           // VRF
           if(rvs_if.rt_vrf_valid_rob2rt[rt_idx]) begin
+            int pos = 0;
             vrf_overlap = 0;
             rt_vrf_byte_strobe = rvs_if.rt_vrf_data_rob2rt[rt_idx].rt_strobe;
             for(int i=0; i<`VLENB; i++) begin
               rt_vrf_bit_strobe[i*8 +: 8] = {8{rvs_if.rt_vrf_data_rob2rt[rt_idx].rt_strobe[i]}}; 
             end 
             foreach(tr.rt_vrf_index[i]) begin
+              // merge same vrf
               if(tr.rt_vrf_index[i] == rvs_if.rt_vrf_data_rob2rt[rt_idx].rt_index) begin
                 tr.rt_vrf_strobe[i] |= rt_vrf_byte_strobe;
                 tr.rt_vrf_data[i]   = rt_vrf_bit_strobe & rvs_if.rt_vrf_data_rob2rt[rt_idx].rt_data | ~rt_vrf_bit_strobe & tr.rt_vrf_data[i];
                 vrf_overlap = 1;
                 `uvm_info(get_type_name(), $sformatf("Uops %0d also write vrf[%0d].", rt_idx, rvs_if.rt_vrf_data_rob2rt[rt_idx].rt_index), UVM_HIGH)
               end
+              // sort vrf
+              if(tr.rt_vrf_index[i] > rvs_if.rt_vrf_data_rob2rt[rt_idx].rt_index) begin
+                pos = i;
+                break;
+              end else begin
+                pos = i+1;
+              end
             end
             if(!vrf_overlap) begin
-              tr.rt_vrf_index.push_back(rvs_if.rt_vrf_data_rob2rt[rt_idx].rt_index);
-              tr.rt_vrf_strobe.push_back(rt_vrf_byte_strobe);
-              tr.rt_vrf_data.push_back(rvs_if.rt_vrf_data_rob2rt[rt_idx].rt_data);
+              tr.rt_vrf_index.insert(pos, rvs_if.rt_vrf_data_rob2rt[rt_idx].rt_index);
+              tr.rt_vrf_strobe.insert(pos, rt_vrf_byte_strobe);
+              tr.rt_vrf_data.insert(pos, rvs_if.rt_vrf_data_rob2rt[rt_idx].rt_data);
             end
           end
 
diff --git a/hdl/verilog/rvv/sve/rvv_backend_tb/src/rvs_agent_rvs_transaction.sv b/hdl/verilog/rvv/sve/rvv_backend_tb/src/rvs_agent_rvs_transaction.sv
index 7286bb1..49a77a1 100644
--- a/hdl/verilog/rvv/sve/rvv_backend_tb/src/rvs_agent_rvs_transaction.sv
+++ b/hdl/verilog/rvv/sve/rvv_backend_tb/src/rvs_agent_rvs_transaction.sv
@@ -2728,7 +2728,7 @@
               if(this.lsu_nf == NF1) begin
                 inst = $sformatf("%se%0d", inst, lsu_eew);
               end else begin
-                inst = $sformatf("%s%0de%0d", inst, lsu_nf+1, lsu_eew);
+                inst = $sformatf("%sseg%0de%0d", inst, lsu_nf+1, lsu_eew);
               end
             end
           endcase
@@ -2737,7 +2737,7 @@
           if(this.lsu_nf == NF1) begin
             inst = $sformatf("%se%0d", inst, lsu_eew);
           end else begin
-            inst = $sformatf("%s%0de%0d", inst, lsu_nf+1, lsu_eew);
+            inst = $sformatf("%sseg%0de%0d", inst, lsu_nf+1, lsu_eew);
           end
         end
         LSU_UI, 
@@ -2745,7 +2745,7 @@
           if(this.lsu_nf == NF1) begin
             inst = $sformatf("%sei%0d", inst, lsu_eew);
           end else begin
-            inst = $sformatf("%s%0dei%0d", inst, lsu_nf+1, lsu_eew);
+            inst = $sformatf("%sseg%0dei%0d", inst, lsu_nf+1, lsu_eew);
           end
         end      
       endcase
diff --git a/hdl/verilog/rvv/sve/rvv_backend_tb/src/rvv_behavior_model.sv b/hdl/verilog/rvv/sve/rvv_backend_tb/src/rvv_behavior_model.sv
index 4337683..5c87f1a 100644
--- a/hdl/verilog/rvv/sve/rvv_backend_tb/src/rvv_behavior_model.sv
+++ b/hdl/verilog/rvv/sve/rvv_backend_tb/src/rvv_behavior_model.sv
@@ -413,6 +413,7 @@
           end
             
           `uvm_info("MDL",$sformatf("Prepare done!\nelm_idx_max=%0d\ndest_eew=%0d\nsrc2_eew=%0d\nsrc1_eew=%0d\ndest_emul=%2.4f\nsrc2_emul=%2.4f\nsrc1_emul=%2.4f\n",elm_idx_max,dest_eew,src2_eew,src1_eew,dest_emul,src2_emul,src1_emul),UVM_LOW)
+          `uvm_info("MDL",$sformatf("pc = 0x%8x, v0 = 0x%16x\n", inst_tr.pc, this.vrf[0]),UVM_LOW)
 
           // 2.2 Check VRF index
           dest_reg_idx_base = inst_tr.dest_idx_base;
@@ -1773,6 +1774,7 @@
 //------------------------------------------------------------------------------
 class lsu_processor extends uvm_component;  
 
+  parameter int MAX_SEG = 8;
   `uvm_component_utils(lsu_processor)
 
   int dest_eew; real dest_emul;
@@ -1781,6 +1783,9 @@
   int src1_eew; real src1_emul;
   int src0_eew; real src0_emul;
 
+  int data_eew; real data_emul;
+  int vidx_eew; real vidx_emul;
+
   vrf_t [31:0] vrf_temp;
 
   int dest_reg_idx_base = 0;
@@ -1800,11 +1805,16 @@
   int address;
 
   int elm_idx_max;
+  int seg_idx;
   int seg_num;
   int seg_size;  // byte size
   int data_size; // byte size
   int vidx_size; // byte size
 
+  int elm_idx_head[MAX_SEG]; // elm pointer
+  int elm_idx_tail[MAX_SEG]; // elm pointer
+  int data_byte_idx[MAX_SEG];
+  int vidx_byte_idx[MAX_SEG];
   int uops_num;
   int elm_per_uop;
 
@@ -1815,12 +1825,28 @@
   endfunction: new 
   
   function void exe(rvv_behavior_model rvm, ref rvs_transaction inst_tr);
-    int uops_cnt = 0;
+    int seg_switch_gap = 0;
+    int switch_seg = 0;
+
     decode(inst_tr);
     `uvm_info("MDL/LSU", "LSU decode done", UVM_HIGH)
     `uvm_info("MDL/LSU", $sformatf("\n%s", inst_tr.sprint()), UVM_HIGH);
 
+    if(data_emul < 1) begin
+      seg_switch_gap = data_emul * `VLENB;
+    end else begin
+      seg_switch_gap = `VLENB;
+    end
+
     for(int seg_idx=0; seg_idx<seg_num; seg_idx++) begin
+      elm_idx_head[seg_idx]  = 0;
+      elm_idx_tail[seg_idx]  = elm_per_uop;
+      data_byte_idx[seg_idx] = 0;
+      vidx_byte_idx[seg_idx] = 0;
+    end
+
+    seg_idx = 0;
+    for(int uops_idx=0; uops_idx<uops_num; uops_idx++) begin
       dest_reg_idx_base = (inst_tr.dest_type == VRF) ? (inst_tr.dest_idx + seg_idx * int'($ceil(dest_emul))) : (inst_tr.dest_idx);
       src3_reg_idx_base = (inst_tr.src3_type == VRF) ? (inst_tr.src3_idx + seg_idx * int'($ceil(src3_emul))) : (inst_tr.src3_idx);
       src2_reg_idx_base = (inst_tr.src2_idx);
@@ -1829,7 +1855,13 @@
       `uvm_info("MDL/LSU", $sformatf("seg_idx=%0d: dest_reg_idx_base=%0d, src3_reg_idx_base=%0d, src2_reg_idx_base=%0d, src1_reg_idx_base=%0d",
                                   seg_idx,     dest_reg_idx_base,     src3_reg_idx_base,     src2_reg_idx_base,     src1_reg_idx_base), UVM_HIGH)
       `uvm_info("MDL/LSU", $sformatf("vreg[0]=0x%16h", rvm.vrf[0]), UVM_HIGH)
-      for(int elm_idx=0; elm_idx<elm_idx_max; elm_idx++) begin
+
+      `uvm_info("MDL/LSU", $sformatf("seg_idx           = %0d\n", seg_idx      ), UVM_HIGH)
+      `uvm_info("MDL/LSU", $sformatf("seg_switch_gap    = %0d\n", seg_switch_gap), UVM_HIGH)
+      `uvm_info("MDL/LSU", $sformatf("elm_idx_head[%0d] = %0d\n", seg_idx, elm_idx_head[seg_idx]), UVM_HIGH)
+      `uvm_info("MDL/LSU", $sformatf("elm_idx_tail[%0d] = %0d\n", seg_idx, elm_idx_tail[seg_idx]), UVM_HIGH)
+
+      for(int elm_idx=elm_idx_head[seg_idx]; elm_idx<elm_idx_tail[seg_idx]; elm_idx++) begin
         // fetch
         dest = rvm.elm_fetch(inst_tr.dest_type, dest_reg_idx_base, elm_idx, dest_eew);
         src3 = rvm.elm_fetch(inst_tr.src3_type, src3_reg_idx_base, elm_idx, src3_eew); 
@@ -1841,7 +1873,8 @@
         `uvm_info("MDL/LSU", $sformatf("dest=0x%8x, src3=0x%8x, src2=0x%8x, src1=0x%8x, src0=0x%8x", dest, src3, src2, src1, src0), UVM_HIGH);
 
         update_addr(inst_tr, seg_idx, seg_size, elm_idx, data_size, src2, src1);
-        if(rvm.trap_occured && uops_cnt<rvm.trap_occured_uop || !rvm.trap_occured) begin
+
+        if(rvm.trap_occured && uops_idx<rvm.trap_occured_uop || !rvm.trap_occured) begin
           if(elm_idx<vstart) begin
             // pre-start
             case(inst_tr.inst_type)
@@ -1884,10 +1917,28 @@
           else 
             rvm.vstart = rvm.trap_queue[0].vstart;
         end 
-        if(elm_idx%elm_per_uop == elm_per_uop-1) uops_cnt++;
+                
+        data_byte_idx[seg_idx] += data_eew/8;
+        vidx_byte_idx[seg_idx] += vidx_eew/8;
         `uvm_info("MDL/LSU", "\n---------------------------------------------------------------------------------------------------------------------------------\n", UVM_HIGH)
+      end // elm-loop
+        
+      switch_seg = data_byte_idx[seg_idx] >= seg_switch_gap;
+    
+      data_byte_idx[seg_idx] = (data_byte_idx[seg_idx] % `VLENB);
+      vidx_byte_idx[seg_idx] = (vidx_byte_idx[seg_idx] % `VLENB);
+
+      elm_idx_head[seg_idx] += elm_per_uop;
+      elm_idx_tail[seg_idx] += elm_per_uop;
+
+      if(switch_seg) begin
+        if(seg_idx == seg_num-1) begin
+          seg_idx = 0;
+        end else begin
+          seg_idx += 1;
+        end
       end
-    end // seg-loop
+    end // uops-loop
   endfunction
 
   function bit decode(ref rvs_transaction inst_tr);
@@ -1927,14 +1978,22 @@
 
     case(inst_tr.inst_type)
       LD: begin
-        seg_size = (seg_num) * dest_eew / 8;
+        seg_size  = (seg_num) * dest_eew / 8;
         data_size = dest_eew / 8;
         vidx_size = src2_eew / 8;
+        data_eew  = dest_eew;
+        vidx_eew  = src2_eew;
+        data_emul = dest_emul;
+        vidx_emul = src2_emul;
       end
       ST: begin
         seg_size = (seg_num) * src3_eew / 8;
         data_size = src3_eew / 8;
         vidx_size = src2_eew / 8;
+        data_eew  = src3_eew;
+        vidx_eew  = src2_eew;
+        data_emul = src3_emul;
+        vidx_emul = src2_emul;
       end
     endcase
     return 0;
diff --git a/hdl/verilog/rvv/sve/rvv_backend_tb/src/rvv_scoreboard.sv b/hdl/verilog/rvv/sve/rvv_backend_tb/src/rvv_scoreboard.sv
index fc83666..ea0e6a3 100644
--- a/hdl/verilog/rvv/sve/rvv_backend_tb/src/rvv_scoreboard.sv
+++ b/hdl/verilog/rvv/sve/rvv_backend_tb/src/rvv_scoreboard.sv
@@ -362,6 +362,10 @@
         lsu_tr = mem_queue_lsu.pop_front();
         mdl_tr = mem_queue_mdl.pop_front();
         `uvm_info("MEM_RECORDER", $sformatf("\nMEM check start. ====================================================================================================\n"),UVM_HIGH)
+        `uvm_info("MEM_RECORDER", "lsu memory tr:", UVM_HIGH)
+        `uvm_info("MEM_RECORDER", lsu_tr.sprint(), UVM_HIGH)
+        `uvm_info("MEM_RECORDER", "mdl memory tr:", UVM_HIGH)
+        `uvm_info("MEM_RECORDER", mdl_tr.sprint(), UVM_HIGH)
         if(lsu_tr.kind != mdl_tr.kind) begin
           `uvm_error("MEM_CHCKER", $sformatf("Memory access kind mismatch: lsu = %s, mdl = %s", lsu_tr.kind.name(), mdl_tr.kind.name()))
           err++;