The option is controlled through xbar_main.hjson

By default all xbar components are marked with
'pipeline' = true
'pipeline_byp' = true

This means all elements (socket1n/socketm1) have both incoming and outgoing
FIFOs that are bypassable if the FIFO is empty

Setting `pipeline_byp` to false makes it a non-bypassable fifo and incurs
extra latency while giving better timing

Setting `pipeline` to false removes the FIFO entirely (no storage element)
and connects input directly to output

The current scheme does the following:
remove pipelines for coreI/coreD to minimize process latency
remove pipelines for ROM / RAM / FLASH to minimize instruction latency
keep pipelines to all other peripherals and memories
diff --git a/util/tlgen/elaborate.py b/util/tlgen/elaborate.py
index 2927fe7..71e95e1 100644
--- a/util/tlgen/elaborate.py
+++ b/util/tlgen/elaborate.py
@@ -82,7 +82,9 @@
         new_node = Node(name="sm1_" + str(len(xbar.nodes)),
                         node_type=NodeType.SOCKET_M1,
                         clock=xbar.clock)
+        new_node.hdepth = 2
         new_node.hpass = 2**len(node.us) - 1
+        new_node.ddepth = 2
         new_node.dpass = 1
         xbar.insert_node(new_node, node)
         process_node(new_node, xbar)
@@ -93,7 +95,9 @@
         new_node = Node(name="s1n_" + str(len(xbar.nodes)),
                         node_type=NodeType.SOCKET_1N,
                         clock=xbar.clock)
+        new_node.hdepth = 2
         new_node.hpass = 1
+        new_node.ddepth = 2
         new_node.dpass = 2**len(node.ds) - 1
         xbar.insert_node(new_node, node)
 
@@ -105,42 +109,62 @@
 
 
 def process_pipeline(xbar):
-    """Check if HOST, DEVICE has pipeline key and is True, then propagate it to end
+    """Check if HOST, DEVICE has settings different from default, then propagate it to end
     """
     for host in xbar.hosts:
-        # go downstream and set the HReqPass at the first instance.
+        # go downstream and change the HReqPass/Depth at the first instance.
         # If it is async, skip.
-        # If Socket 1N, set hpass to 1 and skip
-        # If Socket M1, find position of the host and set 1 of the bit in hpass skip
+        # If Socket 1N,
+        #    if pipeline True and bypass false, set hpass to 0
+        #    if pipeline is False, set depth to 0
+        # If Socket M1, find position of the host and follow procedure above
         # If it is device, it means host and device are directly connected. Ignore now.
 
         # After process node is done, always only one downstream exists in any host node
-        if host.pipeline == False:
-            # No need to process, default is Pass the req/rsp
+        if host.pipeline == True and host.pipeline_byp == True:
+            # No need to process, same as default
             continue
 
+        no_bypass = (host.pipeline == True and host.pipeline_byp == False)
         dnode = host.ds[0].ds
         if dnode.node_type == NodeType.SOCKET_1N:
-            dnode.hpass = 0
+            dnode.hpass = 0 if no_bypass else dnode.hpass
+
         elif dnode.node_type == NodeType.SOCKET_M1:
             idx = dnode.us.index(host.ds)
-            dnode.hpass = dnode.hpass ^ (1 << idx)
+            dnode.hpass = dnode.hpass ^ (
+                1 << idx) if no_bypass else dnode.hpass
+
+        # keep variables separate in case we ever need to differentiate
+        dnode.dpass = 0 if no_bypass else dnode.dpass
+        dnode.hdepth = 0 if host.pipeline == False else dnode.hdepth
+        dnode.ddepth = dnode.hdepth
 
     for device in xbar.devices:
         # go upstream and set DReq/RspPass at the first instance.
         # If it is async, skip
-        # If Socket 1N, set dpass to the bit position and skip
-        # If Socket M1, set dpass to 1 and skip
+        # If Socket M1
+        #    If pipeline True and bypass False, set dpass to 0
+        #    If pipeline False, set depth to 0
+        # If Socket 1N, find position of the device and follow procedure above
         # If it is host, ignore
 
-        if device.pipeline == False:
+        if device.pipeline == True and device.pipeline_byp == True:
             continue
 
+        no_bypass = (device.pipeline == True and device.pipeline_byp == False)
         unode = device.us[0].us
         if unode.node_type == NodeType.SOCKET_1N:
             idx = unode.ds.index(device.us)
-            unode.dpass = unode.dpass ^ (1 << idx)
+            unode.dpass = unode.dpass ^ (
+                1 << idx) if no_bypass else unode.dpass
+
         elif unode.node_type == NodeType.SOCKET_M1:
-            unode.dpass = 0
+            unode.dpass = 0 if no_bypass else unode.dpass
+
+        # keep variables separate in case we ever need to differentiate
+        unode.hpass = 0 if no_bypass else unode.hpass
+        unode.ddepth = 0 if device.pipeline == False else unode.ddepth
+        unode.hdepth = unode.ddepth
 
     return xbar
diff --git a/util/tlgen/item.py b/util/tlgen/item.py
index 116e275..b26b54e 100644
--- a/util/tlgen/item.py
+++ b/util/tlgen/item.py
@@ -54,9 +54,15 @@
     # 1 for Host, Device, 2 for Async FIFO, N for Sockets
     ds = []  # Edges
 
-    # Req/Rsp Pass. default False
+    # Req/Rsp FIFO. default False
+    # when False, FIFO fully passthrough, no storage element
+    # when True, FIFO present with default depth, "pipeline_byp"
+    # controls passthrough option
     pipeline = False
 
+    # FIFO passthrough option. default True
+    pipeline_byp = True
+
     def __init__(self, name, node_type, clock):
         self.name = name
         self.node_type = node_type
diff --git a/util/tlgen/validate.py b/util/tlgen/validate.py
index 653b81f..e573a67 100644
--- a/util/tlgen/validate.py
+++ b/util/tlgen/validate.py
@@ -78,6 +78,9 @@
             node.pipeline = True if nodeobj["pipeline"].lower() in [
                 "true", "1"
             ] else False
+            node.pipeline_byp = True if nodeobj["pipeline_byp"].lower() in [
+                "true", "1"
+            ] else False
         xbar.nodes.append(node)
 
     # Edge
diff --git a/util/tlgen/xbar.rtl.tpl.sv b/util/tlgen/xbar.rtl.tpl.sv
index 0debad5..90685b6 100644
--- a/util/tlgen/xbar.rtl.tpl.sv
+++ b/util/tlgen/xbar.rtl.tpl.sv
@@ -184,18 +184,22 @@
   % elif block.node_type.name == "SOCKET_1N":
   tlul_socket_1n #(
     % if block.hpass != 1:
-    .HReqPass (1'b${block.hpass}),
-    .HRspPass (1'b${block.hpass}),
+    .HReqPass  (1'b${block.hpass}),
+    .HRspPass  (1'b${block.hpass}),
+    % endif
+    % if block.hdepth != 2:
+    .HReqDepth (4'h${block.hdepth}),
+    .HRspDepth (4'h${block.hdepth}),
     % endif
     % if block.dpass != 2**(len(block.ds)) -1:
-    .DReqPass (${len(block.ds)}'h ${"%x" % block.dpass}),
-    .DRspPass (${len(block.ds)}'h ${"%x" % block.dpass}),
+    .DReqPass  (${len(block.ds)}'h${"%x" % block.dpass}),
+    .DRspPass  (${len(block.ds)}'h${"%x" % block.dpass}),
     % endif
-    ## //.HReqDepth(),
-    ## //.HRspDepth(),
-    ## //.DReqDepth(),
-    ## //.DRspDepth(),
-    .N        (${len(block.ds)})
+    % if block.ddepth != 2:
+    .DReqDepth ({${len(block.ds)}{4'h${block.ddepth}}}),
+    .DRspDepth ({${len(block.ds)}{4'h${block.ddepth}}}),
+    % endif
+    .N         (${len(block.ds)})
   ) u_${block.name} (
     .clk_i        (clk_${xbar.clock}_i),
     .rst_ni       (rst_${xbar.clock}_ni),
@@ -207,19 +211,23 @@
   );
   % elif block.node_type.name == "SOCKET_M1":
   tlul_socket_m1 #(
-    % if block.hpass != 2**(len(block.us)) -1:
-    .HReqPass     (${len(block.us)}'h ${"%x" % block.hpass}),
-    .HRspPass     (${len(block.us)}'h ${"%x" % block.hpass}),
+    % if block.hpass != 2**(len(block.us)) - 1:
+    .HReqPass  (${len(block.us)}'h${"%x" % block.hpass}),
+    .HRspPass  (${len(block.us)}'h${"%x" % block.hpass}),
     % endif
-    ## //.HReqDepth    (),
-    ## //.HRspDepth    (),
+    % if block.hdepth != 2:
+    .HReqDepth ({${len(block.us)}{4'h${block.hdepth}}}),
+    .HRspDepth ({${len(block.us)}{4'h${block.hdepth}}}),
+    % endif
+    % if block.ddepth != 2:
+    .DReqDepth (4'h${block.ddepth}),
+    .DRspDepth (4'h${block.ddepth}),
+    % endif
     % if block.dpass != 1:
-    .DReqPass     (1'b${block.dpass}),
-    .DRspPass     (1'b${block.dpass}),
+    .DReqPass  (1'b${block.dpass}),
+    .DRspPass  (1'b${block.dpass}),
     % endif
-    ## //.DReqDepth    (),
-    ## //.DRspDepth    (),
-    .M            (${len(block.us)})
+    .M         (${len(block.us)})
   ) u_${block.name} (
     .clk_i        (clk_${xbar.clock}_i),
     .rst_ni       (rst_${xbar.clock}_ni),