[prim] Add primitive for REQ/ACK synchronization

This commit adds a primitive for synchronizing REQ/ACK handshakes across
clock domain crossings. The primitive comes with a simple scratch
Verilator testbench.

Signed-off-by: Pirmin Vogel <vogelpi@lowrisc.org>
diff --git a/hw/ip/prim/pre_dv/prim_sync_reqack/README.md b/hw/ip/prim/pre_dv/prim_sync_reqack/README.md
new file mode 100644
index 0000000..77447b4
--- /dev/null
+++ b/hw/ip/prim/pre_dv/prim_sync_reqack/README.md
@@ -0,0 +1,34 @@
+REQ/ACK Synchronizer Verilator Testbench
+========================================
+
+This directory contains a basic, scratch Verilator testbench targeting
+functional verification of the REQ/ACK synchronizer primitive during
+development.
+
+How to build and run the testbench
+----------------------------------
+
+From the OpenTitan top level execute
+
+   ```sh
+   fusesoc --cores-root=. run --setup --build \
+     lowrisc:dv_verilator:prim_sync_reqack_tb
+   ```
+to build the testbench and afterwards
+
+   ```sh
+   ./build/lowrisc_dv_verilator_prim_sync_reqack_tb_0/default-verilator/Vprim_sync_reqack_tb \
+     --trace
+   ```
+to run it.
+
+Details of the testbench
+------------------------
+
+- `rtl/prim_sync_reqack_tb.sv`: SystemVerilog testbench, instantiates and
+  drives the DUT, counts handshakes in both domains, signals test end and
+  result (pass/fail) to C++ via output ports. Change this file to e.g.
+  use a different clock ratio or run more transactions.
+- `cpp/prim_sync_reqack_tb.cc`: Contains main function and instantiation of
+  SimCtrl, reads output ports of DUT and signals simulation termination to
+  Verilator.
diff --git a/hw/ip/prim/pre_dv/prim_sync_reqack/cpp/prim_sync_reqack_tb.cc b/hw/ip/prim/pre_dv/prim_sync_reqack/cpp/prim_sync_reqack_tb.cc
new file mode 100644
index 0000000..4b93a4c
--- /dev/null
+++ b/hw/ip/prim/pre_dv/prim_sync_reqack/cpp/prim_sync_reqack_tb.cc
@@ -0,0 +1,62 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "Vprim_sync_reqack_tb.h"
+#include "verilated_toplevel.h"
+#include "verilator_sim_ctrl.h"
+
+#include <signal.h>
+#include <functional>
+#include <iostream>
+
+#include "sim_ctrl_extension.h"
+
+// SimCtrl extension that stops the simulation once the testbench reports done.
+class PrimSyncReqAckTB : public SimCtrlExtension {
+  using SimCtrlExtension::SimCtrlExtension;
+
+ public:
+  explicit PrimSyncReqAckTB(prim_sync_reqack_tb *top);  // top is not owned
+  void OnClock(unsigned long sim_time) override;  // per-clock hook from SimCtrl
+
+ private:
+  prim_sync_reqack_tb *top_;  // Verilated testbench top-level, set by the ctor
+};
+
+// Constructor:
+// - Set up top_ ptr
+PrimSyncReqAckTB::PrimSyncReqAckTB(prim_sync_reqack_tb *top)
+    : SimCtrlExtension{}, top_(top) {}
+
+// Per-cycle hook invoked by SimCtrl: stops the run once the testbench is done
+void PrimSyncReqAckTB::OnClock(unsigned long sim_time) {
+  if (!top_->test_done_o) return;
+  // Forward the pass/fail verdict reported by the SystemVerilog testbench
+  VerilatorSimCtrl::GetInstance().RequestStop(top_->test_passed_o);
+}
+
+int main(int argc, char **argv) {
+  // Verilated model of the SystemVerilog testbench
+  prim_sync_reqack_tb top;
+
+  // Hand the model, its clock and its active-low reset to the simulation
+  // controller
+  VerilatorSimCtrl &simctrl = VerilatorSimCtrl::GetInstance();
+  simctrl.SetTop(&top, &top.clk_i, &top.rst_ni,
+                 VerilatorSimCtrlFlags::ResetPolarityNegative);
+
+  // The extension watches the done/passed ports of the testbench and
+  // requests the end of the simulation
+  PrimSyncReqAckTB primsyncreqacktb(&top);
+  simctrl.RegisterExtension(&primsyncreqacktb);
+
+  std::cout << "Simulation of REQ/ACK Synchronizer primitive" << std::endl
+            << "============================================" << std::endl
+            << std::endl;
+
+  // Run the simulation and pass its result on as the exit code of the
+  // process
+  const int ret_code = simctrl.Exec(argc, argv);
+  return ret_code;
+}
diff --git a/hw/ip/prim/pre_dv/prim_sync_reqack/prim_sync_reqack_tb.core b/hw/ip/prim/pre_dv/prim_sync_reqack/prim_sync_reqack_tb.core
new file mode 100644
index 0000000..bce88fb
--- /dev/null
+++ b/hw/ip/prim/pre_dv/prim_sync_reqack/prim_sync_reqack_tb.core
@@ -0,0 +1,52 @@
+CAPI=2:
+# Copyright lowRISC contributors.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+name: "lowrisc:dv_verilator:prim_sync_reqack_tb"
+description: "REQ/ACK Synchronizer Verilator TB"
+filesets:
+  files_rtl:
+    depend:
+      - lowrisc:prim:all
+    files:
+      - rtl/prim_sync_reqack_tb.sv
+    file_type: systemVerilogSource
+
+  files_dv_verilator:
+    depend:
+      - lowrisc:dv_verilator:simutil_verilator
+
+    files:
+      - cpp/prim_sync_reqack_tb.cc
+    file_type: cppSource
+
+targets:
+  default:
+    default_tool: verilator
+    filesets:
+      - files_rtl
+      - files_dv_verilator
+    toplevel: prim_sync_reqack_tb
+    tools:
+      verilator:
+        mode: cc
+        verilator_options:
+# Disabling tracing reduces compile times severalfold, but doesn't have a
+# huge influence on runtime performance. (Based on early observations.)
+          - '--trace'
+          - '--trace-fst' # this requires -DVM_TRACE_FMT_FST in CFLAGS below!
+          - '--trace-structs'
+          - '--trace-params'
+          - '--trace-max-array 1024'
+# compiler flags
+# (this testbench is built with "-g -O0", see the CFLAGS entry below)
+# -O
+#   Optimization levels have a large impact on the runtime performance of the
+#   simulation model. -O2 and -O3 are pretty similar, -Os is slower than -O2/-O3
+          - '-CFLAGS "-std=c++11 -Wall -DVM_TRACE_FMT_FST -DTOPLEVEL_NAME=prim_sync_reqack_tb -g -O0"'
+          - '-LDFLAGS "-pthread -lutil -lelf"'
+          - "-Wall"
+          - "-Wno-PINCONNECTEMPTY"
+          # XXX: Clean up all warnings and remove this option
+          # (or make it more fine-grained at least)
+          - "-Wno-fatal"
diff --git a/hw/ip/prim/pre_dv/prim_sync_reqack/rtl/prim_sync_reqack_tb.sv b/hw/ip/prim/pre_dv/prim_sync_reqack/rtl/prim_sync_reqack_tb.sv
new file mode 100644
index 0000000..760c621
--- /dev/null
+++ b/hw/ip/prim/pre_dv/prim_sync_reqack/rtl/prim_sync_reqack_tb.sv
@@ -0,0 +1,173 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Scratch testbench for prim_sync_reqack: counts handshakes per side and flags done/pass.
+
+module prim_sync_reqack_tb #(
+) (
+  input  logic clk_i,         // fast clock, the slow clock is derived from it
+  input  logic rst_ni,        // active-low reset
+
+  output logic test_done_o,   // set when the test has finished (pass or fail)
+  output logic test_passed_o  // verdict: 1 = pass, 0 = fail
+);
+
+  // TB configuration
+  localparam int unsigned NumTransactions = 8;
+  localparam logic        FastToSlow = 1'b1; // Select 1'b0 for SlowToFast
+  localparam int unsigned Ratio = 4; // must be even and greater than or equal to 2
+
+  // Derivation of parameters
+  localparam int unsigned Ticks = Ratio/2;
+  localparam int unsigned WidthTicks = $clog2(Ticks)+1;
+  localparam int unsigned WidthTrans = $clog2(NumTransactions)+1;
+
+  // Derive slow clock (using a counter): clk_slow toggles every Ticks cycles of clk_i
+  logic [WidthTicks-1:0] count_clk_d, count_clk_q;
+  assign count_clk_d = count_clk_q == (Ticks[WidthTicks-1:0]-1) ? '0 : count_clk_q + {{WidthTicks-1{1'b0}},{1'b1}};
+  always_ff @(posedge clk_i) begin : reg_count_clk
+    count_clk_q <= count_clk_d;
+  end
+
+  logic clk_slow_d, clk_slow_q, clk_slow;
+  assign clk_slow_d = count_clk_q == (Ticks[WidthTicks-1:0]-1) ? !clk_slow_q : clk_slow_q;
+  always_ff @(posedge clk_i) begin : reg_clk_slow
+    clk_slow_q <= clk_slow_d;
+  end
+  assign clk_slow = clk_slow_q;
+
+  // Sync reset to slow clock (asserted together with rst_ni, released two clk_slow edges later)
+  logic [1:0] rst_slow_nq;
+  logic       rst_slow_n;
+  always_ff @(posedge clk_slow) begin
+    rst_slow_nq <= {rst_slow_nq[0], rst_ni};
+  end
+  assign rst_slow_n = rst_ni & rst_slow_nq[1];
+
+  // Connect clocks: FastToSlow selects which side of the DUT gets the fast clock
+  logic clk_src, clk_dst;
+  assign clk_src = FastToSlow ? clk_i    : clk_slow;
+  assign clk_dst = FastToSlow ? clk_slow : clk_i;
+
+  logic src_req, dst_req;
+  logic src_ack, dst_ack;
+  logic rst_done;
+
+  // Instantiate DUT (both sides share the reset synchronized to the slow clock)
+  prim_sync_reqack prim_sync_reqack (
+    .clk_src_i  (clk_src),
+    .rst_src_ni (rst_slow_n),
+    .clk_dst_i  (clk_dst),
+    .rst_dst_ni (rst_slow_n),
+
+    .src_req_i  (src_req),
+    .src_ack_o  (src_ack),
+    .dst_req_o  (dst_req),
+    .dst_ack_i  (dst_ack)
+  );
+
+  // Make sure we do not apply stimuli before the reset.
+  always_ff @(posedge clk_slow or negedge rst_slow_n) begin
+    if (!rst_slow_n) begin
+      rst_done <= '1; // set as soon as the reset is seen ...
+    end else begin
+      rst_done <= rst_done; // ... and held from then on
+    end
+  end
+
+  // Create randomized ACK delay: ACK is sent once the counter reaches a random maximum
+  localparam int WIDTH_COUNT = 3;
+  logic [31:0]             tmp;
+  logic [31-WIDTH_COUNT:0] unused_tmp;
+  assign unused_tmp = tmp[31:WIDTH_COUNT];
+  logic [WIDTH_COUNT-1:0]  dst_count_clk_d, dst_count_clk_q;
+  logic [WIDTH_COUNT-1:0]  dst_count_clk_max_d, dst_count_clk_max_q;
+  logic                    count_exp;
+  assign count_exp = dst_count_clk_q == dst_count_clk_max_q;
+  always_comb begin
+    dst_count_clk_d     = dst_count_clk_q;
+    dst_count_clk_max_d = dst_count_clk_max_q;
+    tmp                 = '0;
+    if (dst_req && count_exp) begin
+      // Clear counter
+      dst_count_clk_d = '0;
+      // Get new max (the slice follows WIDTH_COUNT, complementing unused_tmp above)
+      tmp = $random;
+      dst_count_clk_max_d = tmp[WIDTH_COUNT-1:0];
+    end else if (dst_req) begin
+      // Increment
+      dst_count_clk_d = dst_count_clk_q + {{WIDTH_COUNT-1{1'b0}},{1'b1}};
+    end
+  end
+  always_ff @(posedge clk_dst or negedge rst_slow_n) begin : reg_dst_count_clk
+    if (!rst_slow_n) begin
+      dst_count_clk_q     <= '0;
+      dst_count_clk_max_q <= '0;
+    end else begin
+      dst_count_clk_q     <= dst_count_clk_d;
+      dst_count_clk_max_q <= dst_count_clk_max_d;
+    end
+  end
+
+  // Apply stimuli
+  always_comb begin
+
+    src_req = 1'b0;
+    dst_ack = 1'b0;
+
+    if (rst_done && rst_slow_n) begin
+      // The source wants to perform handshakes at maximum rate.
+      src_req = 1'b1;
+    end
+
+    if (dst_req && count_exp) begin
+      // The destination sends the ACK after a random delay.
+      dst_ack = 1'b1;
+    end
+  end
+
+  // Count handshakes on both sides (a handshake is REQ and ACK high in the same cycle)
+  logic [WidthTrans-1:0] src_count_d, src_count_q;
+  logic [WidthTrans-1:0] dst_count_d, dst_count_q;
+  assign src_count_d = (src_req && src_ack) ? src_count_q + 1'b1 : src_count_q;
+  always_ff @(posedge clk_src or negedge rst_slow_n) begin : reg_src_count
+    if (!rst_slow_n) begin
+      src_count_q <= '0;
+    end else begin
+      src_count_q <= src_count_d;
+    end
+  end
+  assign dst_count_d = (dst_req && dst_ack) ? dst_count_q + 1'b1 : dst_count_q;
+  always_ff @(posedge clk_dst or negedge rst_slow_n) begin : reg_dst_count
+    if (!rst_slow_n) begin
+      dst_count_q <= '0;
+    end else begin
+      dst_count_q <= dst_count_d;
+    end
+  end
+
+  // Check responses, signal end of simulation (fails if the counts differ by more than one)
+  always_ff @(posedge clk_i) begin : tb_ctrl
+    test_done_o   <= 1'b0;
+    test_passed_o <= 1'b1;
+
+    if ((src_count_q == NumTransactions[WidthTrans-1:0]) &&
+        (dst_count_q == NumTransactions[WidthTrans-1:0])) begin // Success
+
+      $display("\nSUCCESS: Performed %0d handshakes in both source and destination domain.",
+          NumTransactions);
+      $display("Finishing simulation now.\n");
+      test_passed_o <= 1'b1;
+      test_done_o   <= 1'b1;
+    end else if (((src_count_q > dst_count_q) && ((src_count_q - dst_count_q) > 1)) ||
+                 ((dst_count_q > src_count_q) && ((dst_count_q - src_count_q) > 1))) begin // Failed
+      $display("\nERROR: Performed %0d handshakes in source domain, and %0d in destination domain.",
+          src_count_q, dst_count_q);
+      $display("Finishing simulation now.\n");
+      test_passed_o <= 1'b0;
+      test_done_o   <= 1'b1;
+    end
+  end
+
+endmodule
diff --git a/hw/ip/prim/prim.core b/hw/ip/prim/prim.core
index 5a396e6..20d0867 100644
--- a/hw/ip/prim/prim.core
+++ b/hw/ip/prim/prim.core
@@ -29,6 +29,7 @@
       - rtl/prim_fifo_async.sv
       - rtl/prim_fifo_sync.sv
       - rtl/prim_flop_2sync.sv
+      - rtl/prim_sync_reqack.sv
       - rtl/prim_keccak.sv
       - rtl/prim_lfsr.sv
       - rtl/prim_packer.sv
diff --git a/hw/ip/prim/rtl/prim_sync_reqack.sv b/hw/ip/prim/rtl/prim_sync_reqack.sv
new file mode 100644
index 0000000..5484898
--- /dev/null
+++ b/hw/ip/prim/rtl/prim_sync_reqack.sv
@@ -0,0 +1,158 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// REQ/ACK synchronizer
+//
+// This module synchronizes a REQ/ACK handshake across a clock domain crossing.
+// Both domains will see a handshake with the duration of one clock cycle.
+//
+// Notes:
+// - Once asserted, the source domain is not allowed to de-assert REQ without ACK.
+// - The destination domain is not allowed to send an ACK without a REQ.
+// - This module works both when syncing from a faster to a slower clock domain and vice versa.
+// - Internally, this module uses a return-to-zero, four-phase handshake protocol. Assuming the
+//   destination side responds with an ACK immediately, the latency from asserting the REQ on the
+//   source side is:
+//   - 1 source + 2 destination clock cycles until the handshake is performed on the
+//     destination side,
+//   - 1 source + 2 destination + 1 destination + 2 source clock cycles until the handshake is
+//     performed on the source side.
+//   - It takes another round trip (3 source + 3 destination clock cycles) before the next
+//     REQ is starting to be propagated to the destination side. The module is thus not suitable
+//     for high-bandwidth communication.
+
+`include "prim_assert.sv"
+
+module prim_sync_reqack (
+  input  clk_src_i,       // REQ side, SRC domain: clock
+  input  rst_src_ni,      // REQ side, SRC domain: asynchronous, active-low reset
+  input  clk_dst_i,       // ACK side, DST domain: clock
+  input  rst_dst_ni,      // ACK side, DST domain: asynchronous, active-low reset
+
+  input  logic src_req_i, // REQ side, SRC domain: request from the source
+  output logic src_ack_o, // REQ side, SRC domain: acknowledge to the source
+  output logic dst_req_o, // ACK side, DST domain: request to the destination
+  input  logic dst_ack_i  // ACK side, DST domain: acknowledge from the destination
+);
+
+  // Types: both FSMs use the same two states
+  typedef enum logic {
+    HANDSHAKE, SYNC
+  } sync_reqack_fsm_e;
+
+  // Signals (src_* live in the source domain, dst_* in the destination domain)
+  sync_reqack_fsm_e src_fsm_ns, src_fsm_cs;
+  sync_reqack_fsm_e dst_fsm_ns, dst_fsm_cs;
+  logic src_req_d, src_req_q, src_ack;
+  logic dst_ack_d, dst_ack_q, dst_req;
+
+  // Move REQ over to ACK side (registered src_req_q is synchronized into clk_dst_i).
+  prim_flop_2sync #(
+    .Width(1)
+  ) req_sync (
+    .clk_i  (clk_dst_i),
+    .rst_ni (rst_dst_ni),
+    .d      (src_req_q),
+    .q      (dst_req)
+  );
+
+  // Move ACK over to REQ side (registered dst_ack_q is synchronized into clk_src_i).
+  prim_flop_2sync #(
+    .Width(1)
+  ) ack_sync (
+    .clk_i  (clk_src_i),
+    .rst_ni (rst_src_ni),
+    .d      (dst_ack_q),
+    .q      (src_ack)
+  );
+
+  // REQ-side FSM (source domain)
+  always_comb begin : src_fsm
+    src_fsm_ns = src_fsm_cs;
+
+    // By default, we forward the REQ and ACK.
+    src_req_d = src_req_i;
+    src_ack_o = src_ack;
+
+    unique case (src_fsm_cs)
+
+      HANDSHAKE: begin
+        // The handshake on the REQ side is done for exactly 1 clock cycle.
+        if (src_req_i && src_ack) begin
+          src_fsm_ns = SYNC;
+          // Tell ACK side that we are done.
+          src_req_d  = 1'b0;
+        end
+      end
+
+      SYNC: begin
+        // Make sure ACK side knows that we are done: keep REQ low and hide the stale ACK.
+        src_req_d = 1'b0;
+        src_ack_o = 1'b0;
+        if (!src_ack) begin // synchronized ACK has returned to zero
+          src_fsm_ns = HANDSHAKE;
+        end
+      end
+
+      default: ;
+    endcase
+  end
+
+  // ACK-side FSM (destination domain)
+  always_comb begin : dst_fsm
+    dst_fsm_ns = dst_fsm_cs;
+
+    // By default, we forward the REQ and ACK.
+    dst_req_o = dst_req;
+    dst_ack_d = dst_ack_i;
+
+    unique case (dst_fsm_cs)
+
+      HANDSHAKE: begin
+        // The handshake on the ACK side is done for exactly 1 clock cycle.
+        if (dst_req && dst_ack_i) begin
+          dst_fsm_ns = SYNC;
+        end
+      end
+
+      SYNC: begin
+        // Don't forward REQ, hold ACK, wait for REQ side.
+        dst_req_o  = 1'b0;
+        dst_ack_d  = 1'b1;
+        if (!dst_req) begin // synchronized REQ has returned to zero
+          dst_fsm_ns = HANDSHAKE;
+        end
+      end
+
+      default: ;
+    endcase
+  end
+
+  // Registers: FSM state plus the REQ/ACK values that are sent across the CDC
+  always_ff @(posedge clk_src_i or negedge rst_src_ni) begin
+    if (!rst_src_ni) begin
+      src_fsm_cs <= HANDSHAKE;
+      src_req_q  <= 1'b0;
+    end else begin
+      src_fsm_cs <= src_fsm_ns;
+      src_req_q  <= src_req_d;
+    end
+  end
+  always_ff @(posedge clk_dst_i or negedge rst_dst_ni) begin
+    if (!rst_dst_ni) begin
+      dst_fsm_cs <= HANDSHAKE;
+      dst_ack_q  <= 1'b0;
+    end else begin
+      dst_fsm_cs <= dst_fsm_ns;
+      dst_ack_q  <= dst_ack_d;
+    end
+  end
+
+  // Source domain cannot de-assert REQ while waiting for ACK (i.e., while still in HANDSHAKE).
+  `ASSERT(ReqAckSyncHoldReq, $fell(src_req_i) |-> (src_fsm_cs != HANDSHAKE), clk_src_i, rst_src_ni)
+
+  // Destination domain cannot assert ACK without REQ (checked against the forwarded dst_req_o).
+  `ASSERT(ReqAckSyncAckNeedsReq, dst_ack_i |-> dst_req_o, clk_dst_i, rst_dst_ni)
+
+endmodule