[ spi_host rtl ] Prepare to move entire SPI_HOST IP to the peripheral clock domain

This is the first of two commits which aim to standardize the CDC strategy for SPI_HOST.

In this first commit:

- An explicit tlul_fifo_async is created inside spi_host.sv. This FIFO will eventually move
  to the TLUL fabric or the register interface.  However, putting it inside the IP for now
  allows for testing of changes to other blocks without changing the interface, which would
  disrupt DV testing.

- The command_cdc has been removed in favor of a shallow synchronous FIFO in
  spi_host_command_queue.

- spi_host_data_cdc, with its async FIFOs, has also been removed in favor
  of spi_host_data_fifos, in which the FIFOs are synchronous.

Pending until next commit:

- Migration of TLUL fifo to some automated system (TLUL fabric? Regtool?)
- Removal of "Core" clocks & resets
- Change COMMAND register to hw_ext
- DV Interface updates to change clocks

Signed-off-by: Martin Lueker-Boden <martin.lueker-boden@wdc.com>
diff --git a/hw/ip/spi_host/rtl/spi_host.sv b/hw/ip/spi_host/rtl/spi_host.sv
index 13aea5c..3b796d9 100644
--- a/hw/ip/spi_host/rtl/spi_host.sv
+++ b/hw/ip/spi_host/rtl/spi_host.sv
@@ -45,6 +45,33 @@
 
   import spi_host_cmd_pkg::*;
 
+  // TODO: Make this an actual parameter
+  localparam int CmdDepth = 4;
+
+
+  // TODO:
+  // This CDC FIFO is the first step in moving the entire IP
+  // into the "core" clock domain. As such this is temporary.
+  // Once the IP has been confirmed to work with this scheme
+  // we can make changes to top-level and DV to
+  // move this FIFO outside this IP and top_earlgrey
+  tlul_pkg::tl_h2d_t tl_core_in;
+  tlul_pkg::tl_d2h_t tl_core_out;
+
+  tlul_fifo_async #(
+    .ReqDepth(4),
+    .RspDepth(4)
+  ) cdc (
+    .clk_h_i  ( clk_i       ),
+    .rst_h_ni ( rst_ni      ),
+    .clk_d_i  ( clk_core_i  ),
+    .rst_d_ni ( rst_core_ni ),
+    .tl_h_i   ( tl_i        ),
+    .tl_h_o   ( tl_o        ),
+    .tl_d_o   ( tl_core_in  ),
+    .tl_d_i   ( tl_core_out )
+  );
+
   spi_host_reg2hw_t reg2hw;
   spi_host_hw2reg_t hw2reg;
 
@@ -54,10 +81,10 @@
   // Register module
   logic [NumAlerts-1:0] alert_test, alerts;
   spi_host_reg_top u_reg (
-    .clk_i,
-    .rst_ni,
-    .tl_i       (tl_i),
-    .tl_o       (tl_o),
+    .clk_i      (clk_core_i),
+    .rst_ni     (rst_core_ni),
+    .tl_i       (tl_core_in),
+    .tl_o       (tl_core_out),
     .tl_win_o   (fifo_win_h2d),
     .tl_win_i   (fifo_win_d2h),
     .reg2hw,
@@ -77,8 +104,8 @@
       .AsyncOn(AlertAsyncOn[i]),
       .IsFatal(1'b1)
     ) u_prim_alert_sender (
-      .clk_i,
-      .rst_ni,
+      .clk_i         ( clk_core_i    ),
+      .rst_ni        ( rst_core_ni   ),
       .alert_test_i  ( alert_test[i] ),
       .alert_req_i   ( alerts[0]     ),
       .alert_ack_o   (               ),
@@ -122,7 +149,7 @@
   end                   : gen_passthrough_implementation
   else begin            : gen_passthrough_ignore
      // Passthrough only supported for instances with one CSb line
-    `ASSERT(PassthroughNumCSCompat_A, !passthrough_i.passthrough_en, clk_i, rst_ni)
+    `ASSERT(PassthroughNumCSCompat_A, !passthrough_i.passthrough_en, clk_core_i, rst_core_ni)
 
     assign cio_sck_o    = sck;
     assign cio_sck_en_o = 1'b1;
@@ -256,11 +283,11 @@
   // some cases, the writes to COMMAND are not atomic.
   //
   // Disabling this assertion for now
-  //`ASSERT(CmdAtomicity_A, &cmd_qes ^ |cmd_qes, clk_i, rst_ni);
+  //`ASSERT(CmdAtomicity_A, &cmd_qes ^ |cmd_qes, clk_core_i, rst_core_ni);
 
-  logic active, core_active;
-  logic rx_stall, core_rx_stall;
-  logic tx_stall, core_tx_stall;
+  logic active;
+  logic rx_stall;
+  logic tx_stall;
 
   assign hw2reg.status.ready.d    = ~command_busy;
   assign hw2reg.status.active.d   = active;
@@ -272,13 +299,13 @@
   assign hw2reg.status.rxstall.de = 1'b1;
   assign hw2reg.status.txstall.de = 1'b1;
 
-  logic sw_rst, core_sw_rst;
+  logic sw_rst;
 
-  spi_host_command_cdc u_cmd_cdc (
-    .clk_i,
-    .rst_ni,
-    .clk_core_i,
-    .rst_core_ni,
+  spi_host_command_queue #(
+    .CmdDepth(CmdDepth)
+  ) u_cmd_queue (
+    .clk_i                (clk_core_i),
+    .rst_ni               (rst_core_ni),
     .command_i            (command),
     .command_valid_i      (command_valid),
     .command_busy_o       (command_busy),
@@ -286,9 +313,7 @@
     .core_command_valid_o (core_command_valid),
     .core_command_ready_i (core_command_ready),
     .error_busy_o         (error_busy),
-
-    .sw_rst_i             (sw_rst),
-    .core_sw_rst_i        (core_sw_rst)
+    .sw_rst_i             (sw_rst)
   );
 
   logic [31:0] tx_data;
@@ -301,8 +326,8 @@
   logic        rx_ready;
 
   spi_host_window u_window (
-    .clk_i,
-    .rst_ni,
+    .clk_i      (clk_core_i),
+    .rst_ni     (rst_core_ni),
     .win_i      (fifo_win_h2d),
     .win_o      (fifo_win_d2h),
     .tx_data_o  (tx_data),
@@ -352,25 +377,22 @@
 
   logic error_overflow, error_underflow;
 
-  // Since the CDC FIFOs are essentially directly connected to SW registers, it is an error if
+  // Since the DATA FIFOs are essentially directly connected to SW registers, it is an error if
   // there is ever a need for flow control.
   assign error_overflow  = tx_valid & ~tx_ready;
   assign error_underflow = rx_ready & ~rx_valid;
 
-
   // Note on ByteOrder and ByteSwapping.
-  // ByteOrder == 1 is for Little-Endian transmission (i.e. LSB first), which is acheived by default
-  // with the prim_packer_fifo implementation.  Thus we have to swap if Big-Endian transmission
-  // is required (i.e. if ByteOrder == 0).
-  spi_host_data_cdc #(
+  // ByteOrder == 1 is for Little-Endian transmission (i.e. LSB first), which is achieved by
+  // default with the prim_packer_fifo implementation.  Thus we have to swap if Big-Endian
+  // transmission is required (i.e. if ByteOrder == 0).
+  spi_host_data_fifos #(
     .TxDepth(TxDepth),
     .RxDepth(RxDepth),
     .SwapBytes(~ByteOrder)
-  ) u_data_cdc (
-    .clk_i,
-    .rst_ni,
-    .clk_core_i,
-    .rst_core_ni,
+  ) u_data_fifos (
+    .clk_i             (clk_core_i),
+    .rst_ni            (rst_core_ni),
 
     .tx_data_i         (tx_data),
     .tx_be_i           (tx_be),
@@ -401,36 +423,16 @@
     .rx_qd_o           (rx_qd),
     .rx_wm_o           (rx_wm),
 
-    .sw_rst_i          (sw_rst),
-    .core_sw_rst_i     (core_sw_rst)
+    .sw_rst_i          (sw_rst)
 );
 
-  // CDCs for a handful of continuous or pulsed control and status signals
   logic en_sw;
   logic enb_error;
-  logic en, core_en;
+  logic en;
 
-  assign en         = en_sw & ~enb_error;
-  assign sw_rst     = reg2hw.control.sw_rst.q;
-  assign en_sw      = reg2hw.control.spien.q;
-
-  prim_flop_2sync #(
-    .Width(3)
-  ) u_sync_stat_from_core (
-    .clk_i,
-    .rst_ni,
-    .d_i      ({core_rx_stall, core_tx_stall, core_active}),
-    .q_o      ({     rx_stall,      tx_stall,      active})
-  );
-
-  prim_flop_2sync #(
-    .Width(2)
-  ) u_sync_en_to_core (
-    .clk_i    (clk_core_i),
-    .rst_ni   (rst_core_ni),
-    .d_i      ({en,      sw_rst}),
-    .q_o      ({core_en, core_sw_rst})
-  );
+  assign en     = en_sw & ~enb_error;
+  assign sw_rst = reg2hw.control.sw_rst.q;
+  assign en_sw  = reg2hw.control.spien.q;
 
   spi_host_core #(
     .NumCS(NumCS)
@@ -441,7 +443,7 @@
     .command_i       (core_command),
     .command_valid_i (core_command_valid),
     .command_ready_o (core_command_ready),
-    .en_i            (core_en),
+    .en_i            (en),
     .tx_data_i       (core_tx_data),
     .tx_be_i         (core_tx_be),
     .tx_valid_i      (core_tx_valid),
@@ -454,10 +456,10 @@
     .sd_o            (sd_out),
     .sd_en_o         (sd_en),
     .sd_i,
-    .rx_stall_o      (core_rx_stall),
-    .tx_stall_o      (core_tx_stall),
-    .sw_rst_i        (core_sw_rst),
-    .active_o        (core_active)
+    .rx_stall_o      (rx_stall),
+    .tx_stall_o      (tx_stall),
+    .sw_rst_i        (sw_rst),
+    .active_o        (active)
   );
 
   logic event_error;
@@ -504,8 +506,8 @@
   assign enb_error     = |sw_error_status;
 
   prim_intr_hw #(.Width(1)) intr_hw_error (
-    .clk_i,
-    .rst_ni,
+    .clk_i                  (clk_core_i),
+    .rst_ni                 (rst_core_ni),
     .event_intr_i           (event_error),
     .reg2hw_intr_enable_q_i (reg2hw.intr_enable.error.q),
     .reg2hw_intr_test_q_i   (reg2hw.intr_test.error.q),
@@ -557,8 +559,8 @@
   assign event_rx_full  = rx_full_d & ~rx_full_q;
   assign rx_full_d      = rx_full;
 
-  always_ff @(posedge clk_i or negedge rst_ni) begin
-    if (!rst_ni) begin
+  always_ff @(posedge clk_core_i or negedge rst_core_ni) begin
+    if (!rst_core_ni) begin
       idle_q     <= 1'b0;
       ready_q    <= 1'b0;
       tx_wm_q    <= 1'b0;
@@ -576,8 +578,8 @@
   end
 
   prim_intr_hw #(.Width(1)) intr_hw_spi_event (
-    .clk_i,
-    .rst_ni,
+    .clk_i                  (clk_core_i),
+    .rst_ni                 (rst_core_ni),
     .event_intr_i           (event_spi_event),
     .reg2hw_intr_enable_q_i (reg2hw.intr_enable.spi_event.q),
     .reg2hw_intr_test_q_i   (reg2hw.intr_test.spi_event.q),
diff --git a/hw/ip/spi_host/rtl/spi_host_command_cdc.sv b/hw/ip/spi_host/rtl/spi_host_command_cdc.sv
deleted file mode 100644
index 5062ea2..0000000
--- a/hw/ip/spi_host/rtl/spi_host_command_cdc.sv
+++ /dev/null
@@ -1,86 +0,0 @@
-// Copyright lowRISC contributors.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-//
-// CDC module for SPI_HOST commands
-//
-
-module spi_host_command_cdc (
-  input                              clk_i,
-  input                              rst_ni,
-
-  input                              clk_core_i,
-  input                              rst_core_ni,
-
-  input  spi_host_cmd_pkg::command_t command_i,
-  input                              command_valid_i,
-  output logic                       command_busy_o,
-
-  output spi_host_cmd_pkg::command_t core_command_o,
-  output logic                       core_command_valid_o,
-  input                              core_command_ready_i,
-
-  output logic                       error_busy_o,
-
-  input                              sw_rst_i,
-  input                              core_sw_rst_i
-);
-
-  assign error_busy_o = command_valid_i & command_busy_o;
-
-  logic                                 command_ack;
-  logic [spi_host_cmd_pkg::CmdSize-1:0] command_q;
-  logic [spi_host_cmd_pkg::CmdSize-1:0] command_d;
-
-  assign command_ack = command_valid_i & ~command_busy_o & ~sw_rst_i;
-
-  // command_d serves as the input to both the command latch register and to the CDC, to ensure that
-  // the CDC input stays stable for an extra cycle before command_valid_i.
-  assign command_d   = command_ack ? command_i : command_q;
-
-  logic                                 cdc_req_q;
-  logic                                 cdc_req_d;
-  logic                                 cdc_ack;
-
-  assign cdc_req_d = command_ack ? 1'b1 :
-                     cdc_ack     ? 1'b0 :
-                     cdc_req_q;
-
-  assign command_busy_o = cdc_req_q;
-
-  always_ff @(posedge clk_i or negedge rst_ni) begin
-    if (!rst_ni) begin
-      command_q <= {spi_host_cmd_pkg::CmdSize{1'b0}};
-      cdc_req_q <= 1'b0;
-    end else begin
-      cdc_req_q <= cdc_req_d;
-      command_q <= command_d;
-    end
-  end
-
-  // When sw_rst is established, let the handshake end passively by always acknowledging when valid
-  logic core_cdc_req;
-  logic core_cdc_ack;
-  assign core_cdc_ack = (core_command_ready_i | sw_rst_i) & core_cdc_req;
-
-  // drop requests when sw_rst_i is active
-  assign core_command_valid_o = core_cdc_req & ~core_sw_rst_i;
-
-  prim_sync_reqack_data #(
-    .Width(spi_host_cmd_pkg::CmdSize),
-    .DataSrc2Dst(1'b1)
-  ) u_sync_reqack (
-    .clk_src_i  (clk_i),
-    .rst_src_ni (rst_ni),
-    .clk_dst_i  (clk_core_i),
-    .rst_dst_ni (rst_core_ni),
-    .req_chk_i  (1'b1),
-    .src_req_i  (cdc_req_q),
-    .src_ack_o  (cdc_ack),
-    .dst_req_o  (core_cdc_req),
-    .dst_ack_i  (core_cdc_ack),
-    .data_i     (command_d),
-    .data_o     (core_command_o)
-  );
-
-endmodule : spi_host_command_cdc
diff --git a/hw/ip/spi_host/rtl/spi_host_command_queue.sv b/hw/ip/spi_host/rtl/spi_host_command_queue.sv
new file mode 100644
index 0000000..6244c15
--- /dev/null
+++ b/hw/ip/spi_host/rtl/spi_host_command_queue.sv
@@ -0,0 +1,50 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Queue for SPI_HOST commands: a synchronous FIFO (prim_fifo_sync) of CmdDepth entries between
+// the command_* write-side ports and the core_command_* read-side ports.
+
+module spi_host_command_queue #(
+  parameter int CmdDepth = 4  // Passed as the Depth of the underlying prim_fifo_sync
+) (
+  input                              clk_i,
+  input                              rst_ni,
+
+  input  spi_host_cmd_pkg::command_t command_i,        // FIFO write data
+  input                              command_valid_i,  // FIFO write valid
+  output logic                       command_busy_o,   // Inverse of the FIFO's wready_o
+
+  output spi_host_cmd_pkg::command_t core_command_o,        // FIFO read data
+  output logic                       core_command_valid_o,  // FIFO read valid
+  input                              core_command_ready_i,  // FIFO read ready
+
+  output logic                       error_busy_o,  // command_valid_i seen while busy
+
+  input                              sw_rst_i  // Drives the FIFO's clr_i
+);
+
+  logic command_ready;  // wready_o of cmd_fifo
+
+  assign command_busy_o = ~command_ready;
+  assign error_busy_o   = command_valid_i & command_busy_o;  // Write attempted while not ready
+
+  prim_fifo_sync #(
+    .Width(spi_host_cmd_pkg::CmdSize),
+    .Pass(1),
+    .Depth(CmdDepth)
+  ) cmd_fifo (
+    .clk_i,
+    .rst_ni,
+    .clr_i    (sw_rst_i),
+    .wvalid_i (command_valid_i),
+    .wready_o (command_ready),
+    .wdata_i  (command_i),
+    .rvalid_o (core_command_valid_o),
+    .rready_i (core_command_ready_i),
+    .rdata_o  (core_command_o),
+    .full_o   (),  // left unconnected
+    .depth_o  ()   // left unconnected
+  );
+
+endmodule : spi_host_command_queue
diff --git a/hw/ip/spi_host/rtl/spi_host_data_cdc.sv b/hw/ip/spi_host/rtl/spi_host_data_cdc.sv
deleted file mode 100644
index 3a48ea5..0000000
--- a/hw/ip/spi_host/rtl/spi_host_data_cdc.sv
+++ /dev/null
@@ -1,225 +0,0 @@
-// Copyright lowRISC contributors.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-//
-// CDC module for SPI_HOST RX and TX data
-//
-
-module spi_host_data_cdc #(
-  parameter int         TxDepth   = 72,
-  parameter int         RxDepth   = 64,
-  parameter logic       SwapBytes = 0
-) (
-  input               clk_i,
-  input               rst_ni,
-  input               clk_core_i,
-  input               rst_core_ni,
-
-  input        [31:0] tx_data_i,
-  input        [3:0]  tx_be_i,
-  input               tx_valid_i,
-  output logic        tx_ready_o,
-  input        [7:0]  tx_watermark_i,
-
-  output logic [31:0] core_tx_data_o,
-  output logic [3:0]  core_tx_be_o,
-  output logic        core_tx_valid_o,
-  input               core_tx_ready_i,
-
-  input        [31:0] core_rx_data_i,
-  input               core_rx_valid_i,
-  output logic        core_rx_ready_o,
-
-  output logic [31:0] rx_data_o,
-  output logic        rx_valid_o,
-  input               rx_ready_i,
-  input        [7:0]  rx_watermark_i,
-
-  input               sw_rst_i,
-  input               core_sw_rst_i,
-
-  output logic        tx_empty_o,
-  output logic        tx_full_o,
-  output logic [7:0]  tx_qd_o,
-  output logic        tx_wm_o,
-  output logic        rx_empty_o,
-  output logic        rx_full_o,
-  output logic [7:0]  rx_qd_o,
-  output logic        rx_wm_o
-);
-
-  localparam int TxDepthW      = $clog2(TxDepth);
-  localparam int RxDepthW      = $clog2(RxDepth);
-
-  // As async FIFOs must have a power-of-two depth,
-  // requests for non-power-of-two data queues
-  // generate a second, synchronous FIFO to hold
-  // extra data words.
-  //
-  // The second synchronous FIFO is placed on the bus side
-  // of the CDC for ease of TL-UL monitoring.
-  //
-  localparam logic TxAsyncOnlyFifo = (TxDepth == 2**TxDepthW);
-  localparam logic RxAsyncOnlyFifo = (RxDepth == 2**RxDepthW);
-
-  localparam int TxAsyncDepth      = TxAsyncOnlyFifo ? 2**TxDepthW : 2**(TxDepthW - 1);
-  localparam int RxAsyncDepth      = RxAsyncOnlyFifo ? 2**RxDepthW : 2**(RxDepthW - 1);
-
-  localparam int TxSyncDepth       = TxDepth - TxAsyncDepth;
-  localparam int RxSyncDepth       = RxDepth - RxAsyncDepth;
-
-  localparam int TxAsyncDepthW     = prim_util_pkg::vbits(TxAsyncDepth+1);
-  localparam int RxAsyncDepthW     = prim_util_pkg::vbits(RxAsyncDepth+1);
-
-  localparam int TxSyncDepthW      = prim_util_pkg::vbits(TxSyncDepth+1);
-  localparam int RxSyncDepthW      = prim_util_pkg::vbits(RxSyncDepth+1);
-
-  logic [31:0] tx_data_ordered;
-  logic [3:0]  tx_be_ordered;
-  logic [31:0] rx_data_unordered;
-
-  if (SwapBytes) begin : gen_swap
-    assign tx_data_ordered = { << 8 {tx_data_i} };
-    assign tx_be_ordered   = { << { tx_be_i} };
-    assign rx_data_o       = { << 8 { rx_data_unordered } };
-  end else begin : gen_do_not_swap
-    assign tx_data_ordered = tx_data_i;
-    assign tx_be_ordered   = tx_be_i;
-    assign rx_data_o       = rx_data_unordered;
-  end : gen_do_not_swap
-
-  logic [35:0] tx_data_be;
-  logic [35:0] core_tx_data_be;
-
-  assign tx_data_be = { tx_data_ordered, tx_be_ordered };
-  assign { core_tx_data_o, core_tx_be_o } = core_tx_data_be;
-
-  // I/O connections to async fifos
-  logic [35:0]              tx_data_be_async_fifo;
-  logic                     tx_valid_async_fifo;
-  logic                     tx_ready_async_fifo;
-
-  logic [31:0]              rx_data_async_fifo;
-  logic                     rx_valid_async_fifo;
-  logic                     rx_ready_async_fifo;
-
-  logic [TxAsyncDepthW-1:0] tx_depth_async_fifo;
-  logic [RxAsyncDepthW-1:0] rx_depth_async_fifo;
-  logic [7:0]               tx_depth_total;
-  logic [7:0]               rx_depth_total;
-
-  assign tx_qd_o = tx_depth_total;
-  assign rx_qd_o = rx_depth_total;
-
-  if (TxSyncDepth == 0) begin : gen_tx_async_only
-
-    // TODO:  Ignore zero byte writes
-    assign tx_data_be_async_fifo = tx_data_be;
-    assign tx_valid_async_fifo   = tx_valid_i;
-    assign tx_ready_o            = tx_ready_async_fifo;
-    assign tx_depth_total        = 8'(tx_depth_async_fifo);
-
-  end else begin : gen_tx_async_plus_sync
-
-    logic [TxSyncDepthW-1:0] tx_depth_sync_fifo;
-    assign tx_depth_total = 8'(tx_depth_async_fifo) + 8'(tx_depth_sync_fifo);
-
-    prim_fifo_sync #(
-      .Width(36),
-      .Pass(1),
-      .Depth(TxSyncDepth)
-    ) u_tx_sync_fifo (
-      .clk_i,
-      .rst_ni,
-      .clr_i    (sw_rst_i),
-      .wvalid_i (tx_valid_i),
-      .wready_o (tx_ready_o),
-      .wdata_i  (tx_data_be),
-      .rvalid_o (tx_valid_async_fifo),
-      .rready_i (tx_ready_async_fifo | core_sw_rst_i),
-      .rdata_o  (tx_data_be_async_fifo),
-      .full_o   (),
-      .depth_o  (tx_depth_sync_fifo)
-    );
-
-  end : gen_tx_async_plus_sync
-
-  // TODO: Establish better sw_rst technique
-  // Given the lack of external clear sw_rst just drains the fifo over ~64 clocks
-
-  prim_fifo_async #(
-    .Width(36),
-    .Depth(TxAsyncDepth)
-  ) u_tx_async_fifo (
-    .clk_wr_i  (clk_i),
-    .rst_wr_ni (rst_ni),
-    .clk_rd_i  (clk_core_i),
-    .rst_rd_ni (rst_core_ni),
-    .wdata_i   (tx_data_be_async_fifo),
-    .wvalid_i  (tx_valid_async_fifo),
-    .wready_o  (tx_ready_async_fifo),
-    .wdepth_o  (tx_depth_async_fifo),
-    .rdata_o   (core_tx_data_be),
-    .rvalid_o  (core_tx_valid_o),
-    .rready_i  (core_tx_ready_i),
-    .rdepth_o  ()
-  );
-
-  prim_fifo_async #(
-    .Width(32),
-    .Depth(RxAsyncDepth)
-  ) u_rx_async_fifo (
-    .clk_wr_i  (clk_core_i),
-    .rst_wr_ni (rst_core_ni),
-    .clk_rd_i  (clk_i),
-    .rst_rd_ni (rst_ni),
-    .wdata_i   (core_rx_data_i),
-    .wvalid_i  (core_rx_valid_i),
-    .wready_o  (core_rx_ready_o),
-    .wdepth_o  (),
-    .rdata_o   (rx_data_async_fifo),
-    .rvalid_o  (rx_valid_async_fifo),
-    .rready_i  (rx_ready_async_fifo | sw_rst_i),
-    .rdepth_o  (rx_depth_async_fifo)
-  );
-
-  if (RxSyncDepth == 0) begin : gen_rx_async_only
-
-    assign rx_data_unordered   = rx_data_async_fifo;
-    assign rx_valid_o          = rx_valid_async_fifo;
-    assign rx_ready_async_fifo = rx_ready_i;
-    assign rx_depth_total      = 8'(rx_depth_async_fifo);
-
-  end else begin : gen_rx_async_plus_sync
-
-    logic [RxSyncDepthW-1:0] rx_depth_sync_fifo;
-    assign rx_depth_total = 8'(rx_depth_async_fifo) + 8'(rx_depth_sync_fifo);
-
-    prim_fifo_sync #(
-      .Width(32),
-      .Pass(1),
-      .Depth(TxSyncDepth)
-    ) u_rx_sync_fifo (
-      .clk_i,
-      .rst_ni,
-      .clr_i    (sw_rst_i),
-      .wvalid_i (rx_valid_async_fifo),
-      .wready_o (rx_ready_async_fifo),
-      .wdata_i  (rx_data_async_fifo),
-      .rvalid_o (rx_valid_o),
-      .rready_i (rx_ready_i),
-      .rdata_o  (rx_data_unordered),
-      .full_o   (),
-      .depth_o  (rx_depth_sync_fifo)
-    );
-
-  end : gen_rx_async_plus_sync
-
-  assign tx_empty_o = (tx_qd_o == 0);
-  assign rx_empty_o = (rx_qd_o == 0);
-  assign tx_full_o  = (tx_qd_o >= 8'(TxDepth));
-  assign rx_full_o  = (rx_qd_o >= 8'(RxDepth));
-  assign tx_wm_o    = (tx_qd_o >= tx_watermark_i);
-  assign rx_wm_o    = (rx_qd_o >= rx_watermark_i);
-
-endmodule : spi_host_data_cdc
diff --git a/hw/ip/spi_host/rtl/spi_host_data_fifos.sv b/hw/ip/spi_host/rtl/spi_host_data_fifos.sv
new file mode 100644
index 0000000..cf38cfa
--- /dev/null
+++ b/hw/ip/spi_host/rtl/spi_host_data_fifos.sv
@@ -0,0 +1,124 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Module for SPI_HOST RX and TX queues: one prim_fifo_sync per direction, both on clk_i, with
+// optional byte swapping (SwapBytes) on the tx_*/rx_* side and depth-derived status outputs.
+
+module spi_host_data_fifos #(
+  parameter int         TxDepth   = 72,  // Depth of the TX FIFO (36-bit entries)
+  parameter int         RxDepth   = 64,  // Depth of the RX FIFO (32-bit entries)
+  parameter logic       SwapBytes = 0    // Nonzero: reverse TX data/be and RX data order
+) (
+  input               clk_i,
+  input               rst_ni,
+
+  input        [31:0] tx_data_i,       // TX FIFO write data (before optional swap)
+  input        [3:0]  tx_be_i,         // Byte enables stored alongside tx_data_i
+  input               tx_valid_i,      // TX FIFO write valid
+  output logic        tx_ready_o,      // TX FIFO write ready
+  input        [7:0]  tx_watermark_i,  // tx_wm_o is set when tx_qd_o >= this value
+
+  output logic [31:0] core_tx_data_o,   // TX FIFO read data (upper 32 bits of each entry)
+  output logic [3:0]  core_tx_be_o,     // TX FIFO read byte enables (lower 4 bits)
+  output logic        core_tx_valid_o,  // TX FIFO read valid
+  input               core_tx_ready_i,  // TX FIFO read ready
+
+  input        [31:0] core_rx_data_i,   // RX FIFO write data
+  input               core_rx_valid_i,  // RX FIFO write valid
+  output logic        core_rx_ready_o,  // RX FIFO write ready
+
+  output logic [31:0] rx_data_o,       // RX FIFO read data (after optional swap)
+  output logic        rx_valid_o,      // RX FIFO read valid
+  input               rx_ready_i,      // RX FIFO read ready
+  input        [7:0]  rx_watermark_i,  // rx_wm_o is set when rx_qd_o >= this value
+
+  input               sw_rst_i,  // Drives clr_i of both FIFOs
+
+  output logic        tx_empty_o,  // tx_qd_o == 0
+  output logic        tx_full_o,   // tx_qd_o >= TxDepth
+  output logic [7:0]  tx_qd_o,     // TX FIFO depth_o resized to 8 bits
+  output logic        tx_wm_o,     // tx_qd_o >= tx_watermark_i
+  output logic        rx_empty_o,  // rx_qd_o == 0
+  output logic        rx_full_o,   // rx_qd_o >= RxDepth
+  output logic [7:0]  rx_qd_o,     // RX FIFO depth_o resized to 8 bits
+  output logic        rx_wm_o      // rx_qd_o >= rx_watermark_i
+);
+
+  localparam int RxDepthW = prim_util_pkg::vbits(RxDepth+1);  // Bit width of rx_depth
+  localparam int TxDepthW = prim_util_pkg::vbits(TxDepth+1);  // Bit width of tx_depth
+
+  logic [31:0] tx_data_ordered;    // tx_data_i after the optional swap
+  logic [3:0]  tx_be_ordered;      // tx_be_i after the optional swap
+  logic [31:0] rx_data_unordered;  // RX FIFO read data before the optional swap
+
+  if (SwapBytes) begin : gen_swap
+    assign tx_data_ordered = { << 8 {tx_data_i} };  // Reverse the order of the four bytes
+    assign tx_be_ordered   = { << { tx_be_i} };  // Reverse the four enable bits to match
+    assign rx_data_o       = { << 8 { rx_data_unordered } };  // Reverse the four bytes
+  end else begin : gen_do_not_swap
+    assign tx_data_ordered = tx_data_i;
+    assign tx_be_ordered   = tx_be_i;
+    assign rx_data_o       = rx_data_unordered;
+  end : gen_do_not_swap
+
+  logic [35:0]         tx_data_be;       // TX FIFO write word: {data, byte enables}
+  logic [35:0]         core_tx_data_be;  // TX FIFO read word, same packing
+
+  logic [TxDepthW-1:0] tx_depth;  // depth_o of u_tx_fifo
+
+  assign tx_qd_o = 8'(tx_depth);  // Resize to the 8-bit queue-depth output
+
+  assign tx_data_be = { tx_data_ordered, tx_be_ordered };
+  assign { core_tx_data_o, core_tx_be_o } = core_tx_data_be;
+
+  prim_fifo_sync #(
+    .Width(36),
+    .Pass(1),
+    .Depth(TxDepth)
+  ) u_tx_fifo (
+    .clk_i,
+    .rst_ni,
+    .clr_i    (sw_rst_i),
+    .wvalid_i (tx_valid_i),
+    .wready_o (tx_ready_o),
+    .wdata_i  (tx_data_be),
+    .rvalid_o (core_tx_valid_o),
+    .rready_i (core_tx_ready_i),
+    .rdata_o  (core_tx_data_be),
+    .full_o   (),  // left unconnected; tx_full_o is derived from tx_qd_o below
+    .depth_o  (tx_depth)
+  );
+
+  logic [RxDepthW-1:0] rx_depth;  // depth_o of u_rx_fifo
+
+  assign rx_qd_o = 8'(rx_depth);  // Resize to the 8-bit queue-depth output
+
+  // TODO: update fifo_sync prim to allow unknown data
+  // (potentially optionally)
+  prim_fifo_sync #(
+    .Width(32),
+    .Pass(1),
+    .Depth(RxDepth)
+  ) u_rx_fifo (
+    .clk_i,
+    .rst_ni,
+    .clr_i    (sw_rst_i),
+    .wvalid_i (core_rx_valid_i),
+    .wready_o (core_rx_ready_o),
+    .wdata_i  (core_rx_data_i),
+    .rvalid_o (rx_valid_o),
+    .rready_i (rx_ready_i),
+    .rdata_o  (rx_data_unordered),
+    .full_o   (),  // left unconnected; rx_full_o is derived from rx_qd_o below
+    .depth_o  (rx_depth)
+  );
+
+  assign tx_empty_o = (tx_qd_o == 0);  // All status flags come from the 8-bit depths
+  assign rx_empty_o = (rx_qd_o == 0);
+  assign tx_full_o  = (tx_qd_o >= 8'(TxDepth));
+  assign rx_full_o  = (rx_qd_o >= 8'(RxDepth));
+  assign tx_wm_o    = (tx_qd_o >= tx_watermark_i);
+  assign rx_wm_o    = (rx_qd_o >= rx_watermark_i);
+
+endmodule : spi_host_data_fifos
diff --git a/hw/ip/spi_host/spi_host.core b/hw/ip/spi_host/spi_host.core
index dd97bff..ab78cc8 100644
--- a/hw/ip/spi_host/spi_host.core
+++ b/hw/ip/spi_host/spi_host.core
@@ -21,8 +21,8 @@
       - rtl/spi_host_byte_merge.sv
       - rtl/spi_host_fsm.sv
       - rtl/spi_host_core.sv
-      - rtl/spi_host_command_cdc.sv
-      - rtl/spi_host_data_cdc.sv
+      - rtl/spi_host_command_queue.sv
+      - rtl/spi_host_data_fifos.sv
       - rtl/spi_host_reg_top.sv
       - rtl/spi_host_window.sv
       - rtl/spi_host.sv