Make WFI actually stall the core until an interrupt

- Add top-level signals `wfi` and `irq`: `wfi` is an output signaling
  that the core is waiting for an interrupt before proceeding, and `irq`
  is the input for that interrupt. When a WFI instruction is executed,
  the pipeline is halted and `wfi` stays raised until `irq` is asserted.
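- In CoreAxi, the top-level interrupt signal is the active-low `irqn`,
  which is inverted before being passed to Core. The core clock is also
  gated while `wfi` is asserted and re-enabled when `irqn` goes low.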

Change-Id: I49bb6e2e4ed07e0579d36f37f86ede7b33377852
diff --git a/hdl/chisel/src/kelvin/Core.scala b/hdl/chisel/src/kelvin/Core.scala
index 8acfaf2..7d2cbdc 100644
--- a/hdl/chisel/src/kelvin/Core.scala
+++ b/hdl/chisel/src/kelvin/Core.scala
@@ -40,6 +40,8 @@
     val csr = new CsrInOutIO(p)
     val halted = Output(Bool())
     val fault = Output(Bool())
+    val wfi = Output(Bool())
+    val irq = Input(Bool())
     val debug_req = Input(Bool())
 
     // Bus between core and instruction memories.
@@ -66,6 +68,8 @@
   io.ebus   <> score.io.ebus
   io.halted := score.io.halted
   io.fault  := score.io.fault
+  io.wfi    := score.io.wfi
+  score.io.irq := io.irq
   io.iflush <> score.io.iflush
   io.dflush <> score.io.dflush
   io.slog   := score.io.slog
diff --git a/hdl/chisel/src/kelvin/CoreAxi.scala b/hdl/chisel/src/kelvin/CoreAxi.scala
index 7aae784..fe48b87 100644
--- a/hdl/chisel/src/kelvin/CoreAxi.scala
+++ b/hdl/chisel/src/kelvin/CoreAxi.scala
@@ -42,6 +42,8 @@
     // Core status interrupts
     val halted = Output(Bool())
     val fault = Output(Bool())
+    val wfi = Output(Bool())
+    val irqn = Input(Bool())
     // Debug data interface
     val debug = new DebugIO(p)
     // String logging interface
@@ -91,8 +93,8 @@
     val csr = Module(new CoreAxiCSR(p))
     val cg = Module(new ClockGate())
     cg.io.clk_i := io.aclk
-    cg.io.enable := !csr.io.cg
     val core = withClockAndReset(cg.io.clk_o, csr.io.reset) { Core(p, coreModuleName) }
+    cg.io.enable := !io.irqn || (!csr.io.cg && !core.io.wfi)
     csr.io.kelvin_csr := core.io.csr.out
 
     val itcmBridge = Module(new AxiSlave2SRAM(p, log2Ceil(itcmEntries)))
@@ -128,6 +130,8 @@
     itcmBridge.io.periBusy := core.io.ibus.valid
     io.halted := core.io.halted
     io.fault := core.io.fault
+    io.wfi := core.io.wfi
+    core.io.irq := !io.irqn
     csr.io.halted := core.io.halted
     csr.io.fault := core.io.fault
     core.io.debug_req := true.B
@@ -196,6 +200,6 @@
 
     // Tie-offs
     core.io.dflush.ready := true.B
-    core.io.iflush.ready := false.B
+    core.io.iflush.ready := true.B
   }
 }
diff --git a/hdl/chisel/src/kelvin/scalar/Bru.scala b/hdl/chisel/src/kelvin/scalar/Bru.scala
index 6d1a4dd..926d2bb 100644
--- a/hdl/chisel/src/kelvin/scalar/Bru.scala
+++ b/hdl/chisel/src/kelvin/scalar/Bru.scala
@@ -41,6 +41,7 @@
   val MPAUSE = Value
   val MRET = Value
   val FENCEI = Value
+  val WFI = Value
   val UNDEF = Value
 }
 
@@ -127,7 +128,7 @@
       mret -> io.csr.out.mepc,
       ecall -> Cat(io.csr.out.mtvec(31,1), 0.U(1.W)),
       call -> io.csr.out.mepc,
-      (io.req.bits.fwd || (io.req.bits.op === BruOp.FENCEI)) -> pc4De,
+      (io.req.bits.fwd || (io.req.bits.op === BruOp.FENCEI) || (io.req.bits.op === BruOp.WFI)) -> pc4De,
       (io.req.bits.op === BruOp.JALR) -> io.target.data,
   ))
   stateReg.valid := io.req.valid
@@ -166,6 +167,7 @@
     BruOp.BGE    -> (ge  =/= stateReg.bits.fwd),
     BruOp.BLTU   -> (ltu =/= stateReg.bits.fwd),
     BruOp.BGEU   -> (geu =/= stateReg.bits.fwd),
+    BruOp.WFI    -> true.B,
   ))
   io.taken.value := stateReg.bits.target
 
@@ -217,17 +219,18 @@
   io.csr.in.mtval.valid := stateReg.valid && (undefFault || usageFault)
   io.csr.in.mtval.bits := stateReg.bits.pcEx
 
-  io.iflush := stateReg.valid && (op === BruOp.FENCEI)
+  io.iflush := stateReg.valid && op.isOneOf(BruOp.FENCEI, BruOp.WFI)
 
   // Pipeline will be halted.
   io.csr.in.halt := (stateReg.valid && (op === BruOp.MPAUSE) && (mode === CsrMode.Machine)) ||
                     io.csr.in.fault
   io.csr.in.fault := (undefFault && (mode === CsrMode.Machine)) || (usageFault && (mode === CsrMode.Machine))
+  io.csr.in.wfi := stateReg.valid && (op === BruOp.WFI)
 
   // Assertions.
   val ignore = op.isOneOf(BruOp.JAL, BruOp.JALR, BruOp.EBREAK, BruOp.ECALL,
                           BruOp.EEXIT, BruOp.EYIELD, BruOp.ECTXSW, BruOp.MPAUSE,
-                          BruOp.MRET, BruOp.FENCEI, BruOp.UNDEF)
+                          BruOp.MRET, BruOp.FENCEI, BruOp.UNDEF, BruOp.WFI)
 
   assert(!(stateReg.valid && !io.rs1.valid) || ignore)
   assert(!(stateReg.valid && !io.rs2.valid) || ignore)
diff --git a/hdl/chisel/src/kelvin/scalar/Csr.scala b/hdl/chisel/src/kelvin/scalar/Csr.scala
index 07ed2cd..88edcfe 100644
--- a/hdl/chisel/src/kelvin/scalar/Csr.scala
+++ b/hdl/chisel/src/kelvin/scalar/Csr.scala
@@ -55,6 +55,7 @@
     val mtval  = Valid(UInt(32.W))
     val halt   = Output(Bool())
     val fault  = Output(Bool())
+    val wfi    = Output(Bool())
   }
   val out = new Bundle {
     val mode  = Input(CsrMode())
@@ -97,6 +98,8 @@
     // Pipeline Control.
     val halted = Output(Bool())
     val fault  = Output(Bool())
+    val wfi    = Output(Bool())
+    val irq    = Input(Bool())
   })
 
   // Control registers.
@@ -104,7 +107,8 @@
 
   // Pipeline Control.
   val halted = RegInit(false.B)
-  val fault = RegInit(false.B)
+  val fault  = RegInit(false.B)
+  val wfi    = RegInit(false.B)
 
   // Machine(0)/User(1) Mode.
   val mode = RegInit(CsrMode.Machine)
@@ -194,10 +198,13 @@
     fault := true.B
   }
 
+  wfi := Mux(wfi, !io.irq, io.bru.in.wfi)
+
   io.halted := halted
   io.fault  := fault
+  io.wfi    := wfi
 
-  assert(!(io.fault && !io.halted))
+  assert(!(io.fault && !io.halted && !io.wfi))
 
   // Register state.
   val rs1 = io.rs1.data
diff --git a/hdl/chisel/src/kelvin/scalar/Decode.scala b/hdl/chisel/src/kelvin/scalar/Decode.scala
index a42da52..d85e281 100644
--- a/hdl/chisel/src/kelvin/scalar/Decode.scala
+++ b/hdl/chisel/src/kelvin/scalar/Decode.scala
@@ -231,6 +231,8 @@
   // The decode logic.
   val d = DecodeInstruction(p, pipeline, io.inst.bits.addr, io.inst.bits.inst)
 
+  val wfi = d.wfi
+
   val vldst = d.vld || d.vst
   val vldst_wb = vldst && io.inst.bits.inst(28)
 
@@ -334,6 +336,7 @@
     d.mpause -> MakeValid(true.B, BruOp.MPAUSE),
     d.mret   -> MakeValid(true.B, BruOp.MRET),
     d.fencei -> MakeValid(true.B, BruOp.FENCEI),
+    d.wfi    -> MakeValid(true.B, BruOp.WFI),
     d.undef  -> MakeValid(true.B, BruOp.UNDEF),
   ))
   io.bru.valid := decodeEn && bru.valid
@@ -365,10 +368,10 @@
     d.sb             -> MakeValid(true.B, LsuOp.SB),
     d.sh             -> MakeValid(true.B, LsuOp.SH),
     d.sw             -> MakeValid(true.B, LsuOp.SW),
+    d.wfi            -> MakeValid(true.B, LsuOp.FENCEI),
     d.fencei         -> MakeValid(true.B, LsuOp.FENCEI),
     d.flushat        -> MakeValid(true.B, LsuOp.FLUSHAT),
     d.flushall       -> MakeValid(true.B, LsuOp.FLUSHALL),
-    d.wfi            -> MakeValid(true.B, LsuOp.FENCEI),
     (d.vld || d.vst) -> MakeValid(true.B, LsuOp.VLDST),
   ))
   io.lsu.valid := decodeEn && lsu.valid
diff --git a/hdl/chisel/src/kelvin/scalar/SCore.scala b/hdl/chisel/src/kelvin/scalar/SCore.scala
index a077f88..efb7897 100644
--- a/hdl/chisel/src/kelvin/scalar/SCore.scala
+++ b/hdl/chisel/src/kelvin/scalar/SCore.scala
@@ -32,6 +32,8 @@
     val csr = new CsrInOutIO(p)
     val halted = Output(Bool())
     val fault = Output(Bool())
+    val wfi = Output(Bool())
+    val irq = Input(Bool())
 
     val ibus = new IBusIO(p)
     val dbus = new DBusIO(p)
@@ -108,7 +110,7 @@
     decode(i).io.inst.bits.brchFwd := fetch.io.inst.lanes(i).bits.brchFwd
 
     decode(i).io.branchTaken := branchTaken
-    decode(i).io.halted := csr.io.halted
+    decode(i).io.halted := csr.io.halted || csr.io.wfi
   }
 
   // Interlock based on regfile write port dependencies.
@@ -184,6 +186,8 @@
   // Status
   io.halted := csr.io.halted
   io.fault  := csr.io.fault
+  io.wfi    := csr.io.wfi
+  csr.io.irq := io.irq
 
   // ---------------------------------------------------------------------------
   // Load/Store Unit
diff --git a/hdl/chisel/src/matcha/Kelvin.scala b/hdl/chisel/src/matcha/Kelvin.scala
index 352940a..adeed49 100644
--- a/hdl/chisel/src/matcha/Kelvin.scala
+++ b/hdl/chisel/src/matcha/Kelvin.scala
@@ -91,6 +91,7 @@
     finish   := core.io.halted
     host_req := false.B
     fault    := core.io.fault
+    core.io.irq := false.B
 
     // -------------------------------------------------------------------------
     // Scalar Core logging.
diff --git a/tests/renode/rv_core/kelvin_hello_world.c b/tests/renode/rv_core/kelvin_hello_world.c
index 255532e..50bd6ef 100644
--- a/tests/renode/rv_core/kelvin_hello_world.c
+++ b/tests/renode/rv_core/kelvin_hello_world.c
@@ -54,6 +54,7 @@
   print_uint32(*our_pc_csr);
   print_string("beefb0ba\n");
   print_uint32(0xb0bacafeL);
+  asm volatile("wfi");
   asm volatile(".word 0x26000077");  // flushall
   return 0;
 }
diff --git a/tests/renode/sim_main.cc b/tests/renode/sim_main.cc
index 637ff7e..366ad09 100644
--- a/tests/renode/sim_main.cc
+++ b/tests/renode/sim_main.cc
@@ -47,6 +47,7 @@
 
   // First cycle, always evaluate regardless of what role asked.
   if (main_time == 0) {
+    top->io_irqn = true;
     top->eval();
     main_time++;
     return;
@@ -56,6 +57,18 @@
   if (main_time == last_tick) {
     return;
   } else {
+    // On rising-edges, check if the core is in WFI.
+    // If so, generate an interrupt pulse to wake it.
+    static bool irqn_state = true;
+    if (top->io_aclk) {
+      if (top->io_wfi && irqn_state) {
+        irqn_state = false;
+      }
+      if (!top->io_wfi && !irqn_state) {
+        irqn_state = true;
+      }
+    }
+    top->io_irqn = irqn_state;
     top->eval();
     last_tick = main_time;
     main_time++;
diff --git a/tests/verilator_sim/kelvin/core_mini_axi_sim.cc b/tests/verilator_sim/kelvin/core_mini_axi_sim.cc
index 6fb62bd..e7655c2 100644
--- a/tests/verilator_sim/kelvin/core_mini_axi_sim.cc
+++ b/tests/verilator_sim/kelvin/core_mini_axi_sim.cc
@@ -178,6 +178,8 @@
 struct CoreMiniAxi_tb : Sysc_tb {
   sc_in<bool> io_halted;
   sc_in<bool> io_fault;
+  sc_in<bool> io_wfi;
+  sc_out<bool> io_irqn;
 
   CoreMiniAxi_tb(sc_module_name n, int loops, bool random, std::string binary)
       : Sysc_tb(n, loops, random),
@@ -312,6 +314,17 @@
       tg_.addTransfers(status_read_transfer_.get(), 0,
                        CoreMiniAxi_tb::status_read_transfer_done_cb);
     }
+
+    static bool wfi_seen = false;
+    if (io_wfi && !wfi_seen) {
+      io_irqn = false;
+      wfi_seen = true;
+    } else if (wfi_seen) {
+      io_irqn = true;
+      wfi_seen = false;
+    } else {
+      io_irqn = true;
+    }
   }
 
   typedef AXISignals<KP_axi2AddrBits,  // ADDR_WIDTH
@@ -400,13 +413,19 @@
 
   sc_signal<bool> io_halted;
   sc_signal<bool> io_fault;
+  sc_signal<bool> io_wfi;
+  sc_signal<bool> io_irqn;
   tb.io_halted(io_halted);
   tb.io_fault(io_fault);
+  tb.io_wfi(io_wfi);
+  tb.io_irqn(io_irqn);
 
   core.io_aclk(tb.clock);
   core.io_aresetn(tb.resetn);
   core.io_halted(io_halted);
   core.io_fault(io_fault);
+  core.io_wfi(io_wfi);
+  core.io_irqn(io_irqn);
 
   SlogIO slog;
   core.io_slog_valid(slog.valid);
diff --git a/tests/verilator_sim/kelvin/core_tb.cc b/tests/verilator_sim/kelvin/core_tb.cc
index 082b6a8..e53922e 100644
--- a/tests/verilator_sim/kelvin/core_tb.cc
+++ b/tests/verilator_sim/kelvin/core_tb.cc
@@ -56,6 +56,8 @@
 
   sc_signal<bool> io_halted;
   sc_signal<bool> io_fault;
+  sc_signal<bool> io_wfi;
+  sc_signal<bool> io_irq;
   sc_signal<bool> io_debug_req;
   sc_signal<bool> io_ibus_valid;
   sc_signal<bool> io_ibus_ready;
@@ -129,6 +131,8 @@
   core.reset(tb.reset);
   core.io_halted(io_halted);
   core.io_fault(io_fault);
+  core.io_wfi(io_wfi);
+  core.io_irq(io_irq);
   core.io_debug_req(io_debug_req);
   core.io_ibus_valid(io_ibus_valid);
   core.io_ibus_ready(io_ibus_ready);