Add optional delays on AXI read addr/data for TCM & CSR

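Routing the AXI read address and/or read data channels through optional
queues breaks up long read paths (e.g. from SRAM), which helps with timing.
Both delays default to 0, so existing instantiations are unaffected.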

Change-Id: I74a3246b226005c9d21b4d1313aab008aec75899
diff --git a/hdl/chisel/src/kelvin/AxiSlave2SRAM.scala b/hdl/chisel/src/kelvin/AxiSlave2SRAM.scala
index f282b22..3dcd05a 100644
--- a/hdl/chisel/src/kelvin/AxiSlave2SRAM.scala
+++ b/hdl/chisel/src/kelvin/AxiSlave2SRAM.scala
@@ -59,7 +59,10 @@
   io.fabric.writeResp := true.B
 }
 
-class AxiSlave2SRAM(p: Parameters, sramAddressWidth: Int) extends Module {
+class AxiSlave2SRAM(p: Parameters,
+                    sramAddressWidth: Int,
+                    axiReadAddrDelay: Int = 0,
+                    axiReadDataDelay: Int = 0) extends Module {
   val io = IO(new Bundle{
     val axi = Flipped(new AxiMasterIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits))
     val sram = new SRAMIO(p, sramAddressWidth)
@@ -70,7 +73,11 @@
   })
 
   val axi = Module(new AxiSlave(p))
-  axi.io.axi <> io.axi
+  io.axi.write <> axi.io.axi.write
+  // Optionally delay AXI read channel. This helps break up a long timing path
+  // from SRAM.
+  axi.io.axi.read.addr <> Queue(io.axi.read.addr, axiReadAddrDelay)
+  io.axi.read.data <> Queue(axi.io.axi.read.data, axiReadDataDelay)
   axi.io.periBusy := io.periBusy
   io.txnInProgress := axi.io.txnInProgress
 
diff --git a/hdl/chisel/src/kelvin/CoreAxi.scala b/hdl/chisel/src/kelvin/CoreAxi.scala
index c28013d..d7e03d6 100644
--- a/hdl/chisel/src/kelvin/CoreAxi.scala
+++ b/hdl/chisel/src/kelvin/CoreAxi.scala
@@ -82,14 +82,15 @@
         new SRAMInterface(dtcmEntries, Vec(dtcmSubEntries, UInt(dtcmSubEntryWidth.W)), 0, 0, 1, true),
       2))
 
-    val csr = Module(new CoreAxiCSR(p))
+    val csr = Module(new CoreAxiCSR(p, axiReadAddrDelay=1, axiReadDataDelay=0))
     val cg = Module(new ClockGate())
     cg.io.clk_i := rst_sync.io.clk_o
     val core = withClockAndReset(cg.io.clk_o, csr.io.reset.asAsyncReset) { Core(p, coreModuleName) }
     cg.io.enable := io.irq || (!csr.io.cg && !core.io.wfi)
     csr.io.kelvin_csr := core.io.csr.out
 
-    val itcmBridge = Module(new AxiSlave2SRAM(p, log2Ceil(itcmEntries)))
+    val itcmBridge = Module(new AxiSlave2SRAM(
+        p, log2Ceil(itcmEntries), axiReadAddrDelay=1, axiReadDataDelay=1))
 
     itcmArbiter.io.in(0).bits.readwritePorts(0).address := itcmBridge.io.sram.address
     itcmArbiter.io.in(0).bits.readwritePorts(0).enable := itcmBridge.io.sram.enable
@@ -128,7 +129,8 @@
     csr.io.fault := core.io.fault
     core.io.debug_req := true.B
 
-    val dtcmBridge = Module(new AxiSlave2SRAM(p, log2Ceil(dtcmEntries)))
+    val dtcmBridge = Module(new AxiSlave2SRAM(
+        p, log2Ceil(dtcmEntries), axiReadAddrDelay=1, axiReadDataDelay=1))
     dtcmBridge.io.periBusy := core.io.dbus.valid
 
     dtcmArbiter.io.in(0).bits.readwritePorts(0).address := dtcmBridge.io.sram.address
diff --git a/hdl/chisel/src/kelvin/CoreAxiCSR.scala b/hdl/chisel/src/kelvin/CoreAxiCSR.scala
index 87a44c1..3c7900e 100644
--- a/hdl/chisel/src/kelvin/CoreAxiCSR.scala
+++ b/hdl/chisel/src/kelvin/CoreAxiCSR.scala
@@ -68,7 +68,9 @@
   ))
 }
 
-class CoreAxiCSR(p: Parameters) extends Module {
+class CoreAxiCSR(p: Parameters,
+                    axiReadAddrDelay: Int = 0,
+                    axiReadDataDelay: Int = 0) extends Module {
   val io = IO(new Bundle {
     val axi = Flipped(new AxiMasterIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits))
 
@@ -81,7 +83,11 @@
   })
 
   val axi = Module(new AxiSlave(p))
-  axi.io.axi <> io.axi
+  // Optionally delay the AXI read channel. This helps break up the single-cycle read path into multiple cycles as necessary to meet timing.
+  io.axi.write <> axi.io.axi.write
+  axi.io.axi.read.addr <> Queue(io.axi.read.addr, axiReadAddrDelay)
+  io.axi.read.data <> Queue(axi.io.axi.read.data, axiReadDataDelay)
+
   axi.io.periBusy := false.B
 
   val csr = Module(new CoreCSR(p))