[spi_host/rtl] Improve handling of CPHA=1 transactions

This commit fixes several small bugs found when CPHA = 1, which delays
the assertion of SD data by 1/2 SCK cycle.  Problems exist when
the shift register receives the delayed control pulses, but
other control signals have been updated prematurely.  This
commit solves the problem by maintaining delayed copies more
required signals such as speed and byte counts.

Signed-off-by: Martin Lueker-Boden <martin.lueker-boden@wdc.com>
diff --git a/hw/ip/spi_host/rtl/spi_host_fsm.sv b/hw/ip/spi_host/rtl/spi_host_fsm.sv
index 709f7cd..7ab1754 100644
--- a/hw/ip/spi_host/rtl/spi_host_fsm.sv
+++ b/hw/ip/spi_host/rtl/spi_host_fsm.sv
@@ -41,6 +41,8 @@
   logic [15:0]      clk_cntr_q, clk_cntr_d;
   logic             clk_cntr_en;
 
+  logic [1:0]       speed_cpha0, speed_cpha1;
+
   logic [CSW-1:0]   csid;
   logic [CSW-1:0]   csid_q;
 
@@ -62,15 +64,13 @@
   logic [1:0]       cmd_speed_d, cmd_speed_q;
   logic             cmd_wr_en_d, cmd_wr_en_q;
   logic             cmd_rd_en_d, cmd_rd_en_q;
-  // cmd_len needs no data latching as it is only used at the very start of a command.
-  // A corresponding register (i.e. cmd_len_q) would not be  used and would only
-  // create a warning at synthesis
-  logic [8:0]       cmd_len;
+  logic [8:0]       cmd_len_d, cmd_len_q;
   logic             csaat;
   logic             csaat_q;
 
   logic [2:0]       bit_cntr_d, bit_cntr_q;
-  logic [8:0]       byte_cntr_d, byte_cntr_q;
+  logic [8:0]       byte_cntr_cpha0_d, byte_cntr_cpha1_d, byte_cntr_cpha0_q, byte_cntr_cpha1_q;
+  logic [8:0]       byte_cntr_early, byte_cntr_late;
   logic [3:0]       wait_cntr_d, wait_cntr_q;
   logic             last_bit, last_byte;
 
@@ -83,7 +83,11 @@
 
   logic             config_changed;
   logic             fsm_en;
-  logic             new_command;
+
+  // new_command: signals a new segment input
+  // new_command_cpha1: delayed copy for updating the byte_cntr (do not use the delayed copy for
+  //                    updating the FSM state)
+  logic             new_command, new_command_cpha1;
 
   logic             csb_single_d;
   logic [NumCS-1:0] csb_q;
@@ -137,9 +141,7 @@
     csntrail  = new_command ? command_i.configopts.csntrail : csntrail_q;
     clkdiv    = new_command ? command_i.configopts.clkdiv : clkdiv_q;
     csaat     = new_command ? command_i.segment.csaat : csaat_q;
-    // cmd_len needs no data latching as it is only used at the very start of a command
-    // to initialize byte_cntr_q
-    cmd_len     = command_i.segment.len;
+    cmd_len_d   = new_command ? command_i.segment.len : cmd_len_q;
     cmd_wr_en_d = new_command ? command_i.segment.cmd_wr_en : cmd_wr_en_q;
     cmd_rd_en_d = new_command ? command_i.segment.cmd_rd_en : cmd_rd_en_q;
     cmd_speed_d = new_command ? command_i.segment.speed : cmd_speed_q;
@@ -159,6 +161,7 @@
       cmd_rd_en_q <= 1'b0;
       cmd_wr_en_q <= 1'b0;
       cmd_speed_q <= 2'b00;
+      cmd_len_q   <= 9'h0;
     end else begin
       csid_q      <= (new_command && !stall) ? csid : csid_q;
       cpol_q      <= (new_command && !stall) ? cpol : cpol_q;
@@ -172,6 +175,7 @@
       cmd_wr_en_q <= (new_command && !stall) ? cmd_wr_en_d : cmd_wr_en_q;
       cmd_rd_en_q <= (new_command && !stall) ? cmd_rd_en_d : cmd_rd_en_q;
       cmd_speed_q <= (new_command && !stall) ? cmd_speed_d : cmd_speed_q;
+      cmd_len_q   <= (new_command && !stall) ? cmd_len_d : cmd_len_q;
     end
   end
 
@@ -319,6 +323,8 @@
     end
   end
 
+  logic segment_rd_en, segment_rd_en_cpha0, segment_rd_en_cpha1;
+
   assign state_changing = (state_q != state_d);
   assign byte_starting_cpha0 = ~sw_rst_i & state_changing &
                                ((state_d == WaitLead) |
@@ -328,6 +334,9 @@
   assign byte_ending_cpha0   = ~sw_rst_i & state_changing &
                                (state_q == InternalClkHigh & bit_cntr_q == 0);
 
+  assign speed_cpha0         = cmd_speed_q;
+  assign segment_rd_en_cpha0 = cmd_rd_en_q;
+
   // We can calculate byte transitions for CHPA=1 by noting
   // that in this implmentation, the sck edges have a 1-1
   // correspondence with FSM transitions.
@@ -338,14 +347,20 @@
       byte_starting_cpha0_q <= 1'b0;
       byte_ending_cpha0_q   <= 1'b0;
       bit_shifting_cpha0_q  <= 1'b0;
+      speed_cpha1           <= Standard;
+      segment_rd_en_cpha1   <= 1'b0;
+      new_command_cpha1     <= 1'b0;
     end else if (state_changing && !stall) begin
       byte_ending_cpha0_q   <= byte_ending_cpha0;
       byte_starting_cpha0_q <= byte_starting_cpha0;
       bit_shifting_cpha0_q  <= bit_shifting_cpha0;
+      speed_cpha1           <= speed_cpha0;
+      segment_rd_en_cpha1   <= segment_rd_en_cpha0;
+      new_command_cpha1     <= new_command;
     end
   end
 
-  // The <XYZ>_cpha0_q registers queue up a delayed <XYZ> pulse for use
+  // The <XYZ>_cpha0_q pulse registers queue up a delayed <XYZ> pulse for use
   // in CPHA=1 mode. Here we also have to ensure that the resulting
   // pulse is only one cycle long.
   assign byte_starting_cpha1 = byte_starting_cpha0_q & state_changing;
@@ -361,6 +376,17 @@
   assign bit_shifting  = (cpha == 1'b0) ? bit_shifting_cpha0 :
                                           bit_shifting_cpha1;
 
+  assign speed_o       = (cpha == 1'b0) ? speed_cpha0:
+                                          speed_cpha1;
+
+  assign segment_rd_en = (cpha == 1'b0) ? segment_rd_en_cpha0:
+                                          segment_rd_en_cpha1;
+
+  assign byte_cntr_early = (cpha == 1'b0) ? byte_cntr_cpha0_d :
+                                            byte_cntr_cpha1_d;
+  assign byte_cntr_late  = (cpha == 1'b0) ? byte_cntr_cpha0_q :
+                                            byte_cntr_cpha1_q;
+
   logic [2:0] shift_size;
   logic [2:0] start_bit;
 
@@ -403,13 +429,30 @@
                       bit_cntr_q;
 
   assign last_bit  = (bit_cntr_q == 3'h0);
-  assign last_byte = (byte_cntr_q == 9'h0);
+  //
+  // The variable last_byte is only used for updating the FSM state.
+  // For CPHA=1 operation, either byte_cntr_cpha0_q or byte_cntr_cpha1_q
+  // can drive the FSM properly.  However, we explicitly choose
+  // byte_cntr_cpha0_q to avoid a combinational logic loop.
+  //
+  assign last_byte = (byte_cntr_cpha0_q == 9'h0);
 
-  assign byte_cntr_d = sw_rst_i    ? 9'h0 :
-                       !fsm_en     ? byte_cntr_q :
-                       new_command ? cmd_len :
-                       byte_ending ? byte_cntr_q - 1 :
-                       byte_cntr_q;
+  // Note: when updating the byte_cntr in CPHA=0 mode with a new command value, the length must
+  // be pulled in directly from the command bus, cmd_len_d;
+  assign byte_cntr_cpha0_d = sw_rst_i    ? 9'h0 :
+                             !fsm_en     ? byte_cntr_cpha0_q :
+                             new_command ? cmd_len_d :
+                             byte_ending_cpha0 ? byte_cntr_cpha0_q - 1 :
+                             byte_cntr_cpha0_q;
+
+  // Note: when updating the byte_cntr in CPHA=1 mode with a new command value, the length must
+  // be pulled in with a single state delay (using new_command_cpha1) and must use the
+  // registered value, cmd_len_q;
+  assign byte_cntr_cpha1_d = sw_rst_i          ? 9'h0 :
+                             !fsm_en           ? byte_cntr_cpha1_q :
+                             new_command_cpha1 ? cmd_len_q :
+                             byte_ending_cpha1 ? byte_cntr_cpha1_q - 1 :
+                             byte_cntr_cpha1_q;
 
   always_comb begin
     if(sw_rst_i) begin
@@ -445,34 +488,27 @@
 
   always_ff @(posedge clk_i or negedge rst_ni) begin
     if (!rst_ni) begin
-      bit_cntr_q   <= 3'h0;
-      byte_cntr_q  <= 9'h0;
-      wait_cntr_q  <= 4'h0;
+      bit_cntr_q        <= 3'h0;
+      byte_cntr_cpha0_q <= 9'h0;
+      byte_cntr_cpha1_q <= 9'h0;
+      wait_cntr_q       <= 4'h0;
     end else begin
-      bit_cntr_q   <= stall ? bit_cntr_q   : bit_cntr_d;
-      byte_cntr_q  <= stall ? byte_cntr_q  : byte_cntr_d;
-      wait_cntr_q  <= stall ? wait_cntr_q  : wait_cntr_d;
+      bit_cntr_q        <= stall ? bit_cntr_q        : bit_cntr_d;
+      byte_cntr_cpha0_q <= stall ? byte_cntr_cpha0_q : byte_cntr_cpha0_d;
+      byte_cntr_cpha1_q <= stall ? byte_cntr_cpha1_q : byte_cntr_cpha1_d;
+      wait_cntr_q       <= stall ? wait_cntr_q       : wait_cntr_d;
     end
   end
 
   assign wr_en_internal    = byte_starting & cmd_wr_en_d;
   assign shift_en_internal = bit_shifting;
 
-  assign rd_en_internal    = byte_ending & cmd_rd_en_q;
-  assign speed_o           = cmd_speed_q;
+  assign rd_en_internal    = byte_ending & segment_rd_en;
   assign sample_en_d       = byte_starting | shift_en_o;
   assign full_cyc_o        = full_cyc;
-  assign last_read_o       = (byte_cntr_q == 'h0) & rd_en_o & sr_rd_ready_i;
+  assign last_read_o       = (byte_cntr_late == 'h0) & rd_en_o & sr_rd_ready_i;
 
-  // Timing the last_write pulse for back-to-back transactions is somewhat subtle
-  // for CPHA=0 we look at the _early_ byte_cntr_d, to ensure that we properly
-  // handle even 1 byte transactions (which perform a write to the SR at the very
-  // beginning of the segment.  However for CPHA=1, the write pulses
-  // are all later, and so we need to look at the later signal byte_cntr_q.
-  logic  [8:0] last_wrt_cntr;
-  assign last_wrt_cntr     = cpha ? byte_cntr_q : byte_cntr_d;
-
-  assign last_write_o      = (last_wrt_cntr == 'h0) & wr_en_o & sr_wr_ready_i;
+  assign last_write_o      = (byte_cntr_early == 'h0) & wr_en_o & sr_wr_ready_i;
 
   always_ff @(posedge clk_i or negedge rst_ni) begin
     if (!rst_ni) begin
@@ -526,7 +562,7 @@
     if (&csb_o) begin
       sd_en_o[3:0] = 4'h0;
     end else begin
-      unique case (cmd_speed_q)
+      unique case (speed_o)
         Standard: begin
           sd_en_o[0]   = 1'b1;
           sd_en_o[1]   = 1'b0;