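[prim_mubi*_sender] Add option to omit sender flops

Add an `AsyncOn` parameter (default 1) to the prim_mubi{4,8,12,16}_sender
modules and to the prim_mubi_sender.sv.tpl template. With AsyncOn = 1 the
output is still flopped through prim_flop, as before. With AsyncOn = 0 the
flop is replaced by one prim_buf per bit, and clk_i / rst_ni are assigned
to unused_* signals inside that branch.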

Signed-off-by: Michael Schaffner <msf@opentitan.org>
diff --git a/hw/ip/prim/rtl/prim_mubi12_sender.sv b/hw/ip/prim/rtl/prim_mubi12_sender.sv
index 6664d51..8c6100c 100644
--- a/hw/ip/prim/rtl/prim_mubi12_sender.sv
+++ b/hw/ip/prim/rtl/prim_mubi12_sender.sv
@@ -15,6 +15,10 @@
 module prim_mubi12_sender
   import prim_mubi_pkg::*;
 #(
+  // This flops the output if set to 1.
+  // In special cases where the sender is in the same clock domain as the receiver,
+  // this can be set to 0. However, it is recommended to leave this at 1.
+  parameter bit AsyncOn = 1,
   // Reset value for the sender flops
   parameter mubi12_t ResetValue = MuBi12False
 ) (
@@ -27,15 +31,28 @@
   logic [MuBi12Width-1:0] mubi, mubi_out;
   assign mubi = MuBi12Width'(mubi_i);
 
-  prim_flop #(
-    .Width(MuBi12Width),
-    .ResetValue(MuBi12Width'(ResetValue))
-  ) u_prim_flop (
-    .clk_i,
-    .rst_ni,
-    .d_i   ( mubi     ),
-    .q_o   ( mubi_out )
-  );
+  if (AsyncOn) begin : gen_flops  // AsyncOn = 1 (default): output goes through prim_flop
+    prim_flop #(
+      .Width(MuBi12Width),
+      .ResetValue(MuBi12Width'(ResetValue))
+    ) u_prim_flop (
+      .clk_i,
+      .rst_ni,
+      .d_i   ( mubi     ),
+      .q_o   ( mubi_out )
+    );
+  end else begin : gen_no_flops  // AsyncOn = 0: prim_buf path; ResetValue unused
+    for (genvar k = 0; k < MuBi12Width; k++) begin : gen_bits  // one prim_buf per bit
+      prim_buf u_prim_buf (
+        .in_i(mubi[k]),
+        .out_o(mubi_out[k])
+      );
+    end
+    logic unused_clk;  // sink for clk_i, which nothing else in this branch reads
+    logic unused_rst;  // sink for rst_ni, likewise (presumably for lint cleanliness)
+    assign unused_clk = clk_i;
+    assign unused_rst = rst_ni;
+  end
 
   assign mubi_o = mubi12_t'(mubi_out);
 
diff --git a/hw/ip/prim/rtl/prim_mubi16_sender.sv b/hw/ip/prim/rtl/prim_mubi16_sender.sv
index 90ddac3..05ff89d 100644
--- a/hw/ip/prim/rtl/prim_mubi16_sender.sv
+++ b/hw/ip/prim/rtl/prim_mubi16_sender.sv
@@ -15,6 +15,10 @@
 module prim_mubi16_sender
   import prim_mubi_pkg::*;
 #(
+  // This flops the output if set to 1.
+  // In special cases where the sender is in the same clock domain as the receiver,
+  // this can be set to 0. However, it is recommended to leave this at 1.
+  parameter bit AsyncOn = 1,
   // Reset value for the sender flops
   parameter mubi16_t ResetValue = MuBi16False
 ) (
@@ -27,15 +31,28 @@
   logic [MuBi16Width-1:0] mubi, mubi_out;
   assign mubi = MuBi16Width'(mubi_i);
 
-  prim_flop #(
-    .Width(MuBi16Width),
-    .ResetValue(MuBi16Width'(ResetValue))
-  ) u_prim_flop (
-    .clk_i,
-    .rst_ni,
-    .d_i   ( mubi     ),
-    .q_o   ( mubi_out )
-  );
+  if (AsyncOn) begin : gen_flops  // AsyncOn = 1 (default): output goes through prim_flop
+    prim_flop #(
+      .Width(MuBi16Width),
+      .ResetValue(MuBi16Width'(ResetValue))
+    ) u_prim_flop (
+      .clk_i,
+      .rst_ni,
+      .d_i   ( mubi     ),
+      .q_o   ( mubi_out )
+    );
+  end else begin : gen_no_flops  // AsyncOn = 0: prim_buf path; ResetValue unused
+    for (genvar k = 0; k < MuBi16Width; k++) begin : gen_bits  // one prim_buf per bit
+      prim_buf u_prim_buf (
+        .in_i(mubi[k]),
+        .out_o(mubi_out[k])
+      );
+    end
+    logic unused_clk;  // sink for clk_i, which nothing else in this branch reads
+    logic unused_rst;  // sink for rst_ni, likewise (presumably for lint cleanliness)
+    assign unused_clk = clk_i;
+    assign unused_rst = rst_ni;
+  end
 
   assign mubi_o = mubi16_t'(mubi_out);
 
diff --git a/hw/ip/prim/rtl/prim_mubi4_sender.sv b/hw/ip/prim/rtl/prim_mubi4_sender.sv
index 7aa5abc..eab20fd 100644
--- a/hw/ip/prim/rtl/prim_mubi4_sender.sv
+++ b/hw/ip/prim/rtl/prim_mubi4_sender.sv
@@ -15,6 +15,10 @@
 module prim_mubi4_sender
   import prim_mubi_pkg::*;
 #(
+  // This flops the output if set to 1.
+  // In special cases where the sender is in the same clock domain as the receiver,
+  // this can be set to 0. However, it is recommended to leave this at 1.
+  parameter bit AsyncOn = 1,
   // Reset value for the sender flops
   parameter mubi4_t ResetValue = MuBi4False
 ) (
@@ -27,15 +31,28 @@
   logic [MuBi4Width-1:0] mubi, mubi_out;
   assign mubi = MuBi4Width'(mubi_i);
 
-  prim_flop #(
-    .Width(MuBi4Width),
-    .ResetValue(MuBi4Width'(ResetValue))
-  ) u_prim_flop (
-    .clk_i,
-    .rst_ni,
-    .d_i   ( mubi     ),
-    .q_o   ( mubi_out )
-  );
+  if (AsyncOn) begin : gen_flops  // AsyncOn = 1 (default): output goes through prim_flop
+    prim_flop #(
+      .Width(MuBi4Width),
+      .ResetValue(MuBi4Width'(ResetValue))
+    ) u_prim_flop (
+      .clk_i,
+      .rst_ni,
+      .d_i   ( mubi     ),
+      .q_o   ( mubi_out )
+    );
+  end else begin : gen_no_flops  // AsyncOn = 0: prim_buf path; ResetValue unused
+    for (genvar k = 0; k < MuBi4Width; k++) begin : gen_bits  // one prim_buf per bit
+      prim_buf u_prim_buf (
+        .in_i(mubi[k]),
+        .out_o(mubi_out[k])
+      );
+    end
+    logic unused_clk;  // sink for clk_i, which nothing else in this branch reads
+    logic unused_rst;  // sink for rst_ni, likewise (presumably for lint cleanliness)
+    assign unused_clk = clk_i;
+    assign unused_rst = rst_ni;
+  end
 
   assign mubi_o = mubi4_t'(mubi_out);
 
diff --git a/hw/ip/prim/rtl/prim_mubi8_sender.sv b/hw/ip/prim/rtl/prim_mubi8_sender.sv
index c3f2b44..65d57fb 100644
--- a/hw/ip/prim/rtl/prim_mubi8_sender.sv
+++ b/hw/ip/prim/rtl/prim_mubi8_sender.sv
@@ -15,6 +15,10 @@
 module prim_mubi8_sender
   import prim_mubi_pkg::*;
 #(
+  // This flops the output if set to 1.
+  // In special cases where the sender is in the same clock domain as the receiver,
+  // this can be set to 0. However, it is recommended to leave this at 1.
+  parameter bit AsyncOn = 1,
   // Reset value for the sender flops
   parameter mubi8_t ResetValue = MuBi8False
 ) (
@@ -27,15 +31,28 @@
   logic [MuBi8Width-1:0] mubi, mubi_out;
   assign mubi = MuBi8Width'(mubi_i);
 
-  prim_flop #(
-    .Width(MuBi8Width),
-    .ResetValue(MuBi8Width'(ResetValue))
-  ) u_prim_flop (
-    .clk_i,
-    .rst_ni,
-    .d_i   ( mubi     ),
-    .q_o   ( mubi_out )
-  );
+  if (AsyncOn) begin : gen_flops  // AsyncOn = 1 (default): output goes through prim_flop
+    prim_flop #(
+      .Width(MuBi8Width),
+      .ResetValue(MuBi8Width'(ResetValue))
+    ) u_prim_flop (
+      .clk_i,
+      .rst_ni,
+      .d_i   ( mubi     ),
+      .q_o   ( mubi_out )
+    );
+  end else begin : gen_no_flops  // AsyncOn = 0: prim_buf path; ResetValue unused
+    for (genvar k = 0; k < MuBi8Width; k++) begin : gen_bits  // one prim_buf per bit
+      prim_buf u_prim_buf (
+        .in_i(mubi[k]),
+        .out_o(mubi_out[k])
+      );
+    end
+    logic unused_clk;  // sink for clk_i, which nothing else in this branch reads
+    logic unused_rst;  // sink for rst_ni, likewise (presumably for lint cleanliness)
+    assign unused_clk = clk_i;
+    assign unused_rst = rst_ni;
+  end
 
   assign mubi_o = mubi8_t'(mubi_out);
 
diff --git a/util/design/data/prim_mubi_sender.sv.tpl b/util/design/data/prim_mubi_sender.sv.tpl
index 687ec57..f3f60c9 100644
--- a/util/design/data/prim_mubi_sender.sv.tpl
+++ b/util/design/data/prim_mubi_sender.sv.tpl
@@ -15,6 +15,10 @@
 module prim_mubi${n_bits}_sender
   import prim_mubi_pkg::*;
 #(
+  // This flops the output if set to 1.
+  // In special cases where the sender is in the same clock domain as the receiver,
+  // this can be set to 0. However, it is recommended to leave this at 1.
+  parameter bit AsyncOn = 1,
   // Reset value for the sender flops
   parameter mubi${n_bits}_t ResetValue = MuBi${n_bits}False
 ) (
@@ -27,15 +31,28 @@
   logic [MuBi${n_bits}Width-1:0] mubi, mubi_out;
   assign mubi = MuBi${n_bits}Width'(mubi_i);
 
-  prim_flop #(
-    .Width(MuBi${n_bits}Width),
-    .ResetValue(MuBi${n_bits}Width'(ResetValue))
-  ) u_prim_flop (
-    .clk_i,
-    .rst_ni,
-    .d_i   ( mubi     ),
-    .q_o   ( mubi_out )
-  );
+  if (AsyncOn) begin : gen_flops  // AsyncOn = 1 (default): output goes through prim_flop
+    prim_flop #(
+      .Width(MuBi${n_bits}Width),
+      .ResetValue(MuBi${n_bits}Width'(ResetValue))
+    ) u_prim_flop (
+      .clk_i,
+      .rst_ni,
+      .d_i   ( mubi     ),
+      .q_o   ( mubi_out )
+    );
+  end else begin : gen_no_flops  // AsyncOn = 0: prim_buf path; ResetValue unused
+    for (genvar k = 0; k < MuBi${n_bits}Width; k++) begin : gen_bits  // one prim_buf per bit
+      prim_buf u_prim_buf (
+        .in_i(mubi[k]),
+        .out_o(mubi_out[k])
+      );
+    end
+    logic unused_clk;  // sink for clk_i, which nothing else in this branch reads
+    logic unused_rst;  // sink for rst_ni, likewise (presumably for lint cleanliness)
+    assign unused_clk = clk_i;
+    assign unused_rst = rst_ni;
+  end
 
   assign mubi_o = mubi${n_bits}_t'(mubi_out);