[spi_device] Add clock buffers to `clk_spi_in/out`

These clocks are derived from the `cio_sck_i` input and drive
multiple flops inside the SPI device. On FPGA, using clock
buffers enables routing these clocks on dedicated clock
routing resources and gives an anchor point to constrain them.
Otherwise, the tool uses regular signal routing for these
clocks, which can lead to hold time violations.

This is related to lowRISC/OpenTitan#3606.

Signed-off-by: Pirmin Vogel <vogelpi@lowrisc.org>
diff --git a/hw/ip/spi_device/rtl/spi_device.sv b/hw/ip/spi_device/rtl/spi_device.sv
index 57cb0c1..ffb8ee6 100644
--- a/hw/ip/spi_device/rtl/spi_device.sv
+++ b/hw/ip/spi_device/rtl/spi_device.sv
@@ -46,8 +46,8 @@
   localparam int PtrW = SramAw + 1 + SDW;
   localparam int AsFifoDepthW = $clog2(FifoDepth+1);
 
-  logic clk_spi_in;   // clock for latch SDI
-  logic clk_spi_out;  // clock for driving SDO
+  logic clk_spi_in, clk_spi_in_buf;   // clock for latching SDI (unbuffered, buffered)
+  logic clk_spi_out, clk_spi_out_buf; // clock for driving SDO (unbuffered, buffered)
 
   spi_device_reg2hw_t reg2hw;
   spi_device_hw2reg_t hw2reg;
@@ -189,11 +189,11 @@
   );
 
   logic rxf_full_q, txf_empty_q;
-  always_ff @(posedge clk_spi_in or negedge rst_ni) begin
+  always_ff @(posedge clk_spi_in_buf or negedge rst_ni) begin
     if (!rst_ni) rxf_full_q <= 1'b0;
     else         rxf_full_q <= ~rxf_wready;
   end
-  always_ff @(posedge clk_spi_out or negedge rst_ni) begin
+  always_ff @(posedge clk_spi_out_buf or negedge rst_ni) begin
     if (!rst_ni) txf_empty_q <= 1'b1;
     else         txf_empty_q <= ~txf_rvalid;
   end
@@ -259,7 +259,7 @@
   //    Could trigger lint error for input clock.
   //    It's unavoidable due to the characteristics of SPI intf
   prim_pulse_sync u_rxf_overflow (
-    .clk_src_i   (clk_spi_in         ),
+    .clk_src_i   (clk_spi_in_buf     ),
     .rst_src_ni  (rst_ni             ),
     .src_pulse_i (rxf_overflow       ),
     .clk_dst_i   (clk_i              ),
@@ -271,7 +271,7 @@
   //    Could trigger lint error for input clock.
   //    It's unavoidable due to the characteristics of SPI intf
   prim_pulse_sync u_txf_underflow (
-    .clk_src_i   (clk_spi_out         ),
+    .clk_src_i   (clk_spi_out_buf     ),
     .rst_src_ni  (rst_ni              ),
     .src_pulse_i (txf_underflow       ),
     .clk_dst_i   (clk_i               ),
@@ -318,6 +318,15 @@
   assign clk_spi_in  = (cpha ^ cpol) ? sck_n    : cio_sck_i   ;
   assign clk_spi_out = (cpha ^ cpol) ? cio_sck_i    : sck_n   ;
 
+  prim_clock_buf u_clk_spi_in_buf(   // buffer the SDI latching clock
+    .clk_i (clk_spi_in),             // unbuffered, selected from cio_sck_i / sck_n
+    .clk_o (clk_spi_in_buf)          // buffered copy used by the flops in this module
+  );
+  prim_clock_buf u_clk_spi_out_buf(  // buffer the SDO driving clock
+    .clk_i (clk_spi_out),            // unbuffered, selected from cio_sck_i / sck_n
+    .clk_o (clk_spi_out_buf)         // buffered copy used by the flops in this module
+  );
+
   assign rst_spi_n = (scanmode_i) ? rst_ni : rst_ni & ~cio_csb_i;
 
   assign rst_txfifo_n = (scanmode_i) ? rst_ni : rst_ni & ~rst_txfifo_reg;
@@ -328,10 +337,10 @@
   // FW Mode //
   /////////////
   spi_fwmode u_fwmode (
-    .clk_in_i     (clk_spi_in),
+    .clk_in_i     (clk_spi_in_buf),
     .rst_in_ni    (rst_spi_n),
 
-    .clk_out_i    (clk_spi_out),
+    .clk_out_i    (clk_spi_out_buf),
     .rst_out_ni   (rst_spi_n),
 
     .cpha_i        (cpha),
@@ -363,7 +372,7 @@
     .Width (FifoWidth),
     .Depth (FifoDepth)
   ) u_rx_fifo (
-    .clk_wr_i     (clk_spi_in),
+    .clk_wr_i     (clk_spi_in_buf),
     .rst_wr_ni    (rst_rxfifo_n),
 
     .clk_rd_i     (clk_i),
@@ -388,7 +397,7 @@
     .clk_wr_i     (clk_i),
     .rst_wr_ni    (rst_txfifo_n),
 
-    .clk_rd_i     (clk_spi_out),
+    .clk_rd_i     (clk_spi_out_buf),
     .rst_rd_ni    (rst_txfifo_n),
 
     .wvalid_i     (txf_wvalid),
diff --git a/hw/ip/spi_device/spi_device.core b/hw/ip/spi_device/spi_device.core
index 19d22e5..840257b 100644
--- a/hw/ip/spi_device/spi_device.core
+++ b/hw/ip/spi_device/spi_device.core
@@ -10,6 +10,7 @@
     depend:
       - lowrisc:ip:tlul
       - lowrisc:prim:all
+      - lowrisc:prim:clock_buf
       - lowrisc:prim:clock_gating
       - lowrisc:prim:clock_inv
       - lowrisc:prim:ram_2p_adv