Respect queue capacity in Lsu and Rvv.
Use the free queue capacity reported by Lsu (LsuV2 only) and Rvv instead of
ready/valid to determine if an instruction can be issued; LSU V1 still relies on ready/valid.
Change-Id: I2f3915f702ab465c641a0930340e56c326905129
diff --git a/hdl/chisel/src/kelvin/rvv/RvvCore.scala b/hdl/chisel/src/kelvin/rvv/RvvCore.scala
index 43503b2..f039c91 100644
--- a/hdl/chisel/src/kelvin/rvv/RvvCore.scala
+++ b/hdl/chisel/src/kelvin/rvv/RvvCore.scala
@@ -113,6 +113,7 @@
| output [2:0] configSew,
| output [2:0] configLmul,
| output logic rvv_idle,
+ | output logic [3:0] queue_capacity,
|""".stripMargin.replaceAll("VSTART_LEN", (log2Ceil(vlen) - 1).toString)
@@ -257,7 +258,8 @@
| .vcsr_ready(vcsr_ready),
| .config_state_valid(configStateValid),
| .config_state(config_state),
- | .rvv_idle(rvv_idle)
+ | .rvv_idle(rvv_idle),
+ | .queue_capacity(queue_capacity)
|""".stripMargin.replaceAll("GENN", instructionLanes.toString)
coreInstantiation += " );\n"
@@ -322,6 +324,7 @@
val configSew = Output(UInt(3.W))
val configLmul = Output(UInt(3.W))
val rvv_idle = Output(Bool())
+ val queue_capacity = Output(UInt(4.W))
})
// Resources must be sorted topologically by dependency DAG
@@ -420,6 +423,7 @@
io.configState.bits.sew := rvvCoreWrapper.io.configSew
io.configState.bits.lmul := rvvCoreWrapper.io.configLmul
io.rvv_idle := rvvCoreWrapper.io.rvv_idle
+ io.queue_capacity := rvvCoreWrapper.io.queue_capacity
val vstart_wdata = MuxCase(vstart, Seq(
rvvCoreWrapper.io.vcsr_valid -> rvvCoreWrapper.io.vcsr_vstart,
@@ -436,3 +440,4 @@
io.csr.vstart := vstart
io.csr.vxrm := vxrm
}
+
diff --git a/hdl/chisel/src/kelvin/rvv/RvvInterface.scala b/hdl/chisel/src/kelvin/rvv/RvvInterface.scala
index 6c68488..babad57 100644
--- a/hdl/chisel/src/kelvin/rvv/RvvInterface.scala
+++ b/hdl/chisel/src/kelvin/rvv/RvvInterface.scala
@@ -68,6 +68,7 @@
val csr = new RvvCsrIO(p)
val rvv_idle = Output(Bool())
+ val queue_capacity = Output(UInt(4.W))
}
class RvvCsrIO(p: Parameters) extends Bundle {
diff --git a/hdl/chisel/src/kelvin/scalar/Decode.scala b/hdl/chisel/src/kelvin/scalar/Decode.scala
index 5e98079..6e05ab4 100644
--- a/hdl/chisel/src/kelvin/scalar/Decode.scala
+++ b/hdl/chisel/src/kelvin/scalar/Decode.scala
@@ -192,7 +192,14 @@
def isFloatStore(): Bool = {
float.map(f => f.valid && f.bits.opcode === FloatOpcode.STOREFP).getOrElse(false.B)
}
- def isLsu(): Bool = { isScalarLoad() || isScalarStore() || vld || vst || flushat || flushall || isFloatLoad() || isFloatStore() }
+ // True for any instruction handled by the LSU: scalar/float loads and stores, vld/vst,
+ // flushat/flushall, and valid RVV load/store instructions.
+ def isLsu(): Bool = {
+   // `rvv` is an Option (assumed present iff p.enableRvv); None counts as "not an RVV load/store".
+   val isRvvLoadStore = rvv.map(r => r.valid && r.bits.isLoadStore()).getOrElse(false.B)
+   isScalarLoad() || isScalarStore() || vld || vst || flushat || flushall ||
+     isFloatLoad() || isFloatStore() || isRvvLoadStore
+ }
def isMul(): Bool = { mul || mulh || mulhsu || mulhu }
def isDvu(): Bool = { div || divu || rem || remu }
def isVector(): Bool = { vld || vst || viop || getvl || getmaxvl }
@@ -284,6 +291,7 @@
// LSU interface.
val lsu = Vec(p.instructionLanes, Decoupled(new LsuCmd(p)))
+ val lsuQueueCapacity = Input(UInt(3.W))
// Multiplier interface.
val mlu = Vec(p.instructionLanes, Decoupled(new MluCmd))
@@ -296,6 +304,7 @@
Vec(p.instructionLanes, Decoupled(new RvvCompressedInstruction)))
val rvvState = Option.when(p.enableRvv)(Input(Valid(new RvvConfigState(p))))
val rvvIdle = Option.when(p.enableRvv)(Input(Bool()))
+ val rvvQueueCapacity = Option.when(p.enableRvv)(Input(UInt(4.W)))
// Vector interface, to maintain interface compatibility with old dispatch
// unit.
@@ -409,9 +418,9 @@
val fence = decodedInsts.map(x => x.isFency() && (io.mactive || io.lsuActive))
// ---------------------------------------------------------------------------
- // Rvv interlock rules
+ // Rvv config interlock rules
// RVV Load store unit requires valid config state on dispatch.
- val rvvInterlock = if (p.enableRvv) {
+ val rvvConfigInterlock = if (p.enableRvv) {
val configChange = decodedInsts.map(
x => x.rvv.get.valid && x.rvv.get.bits.isVset())
val configInvalid = configChange.scan(!io.rvvState.get.valid)(_ || _)
@@ -426,19 +435,29 @@
}
// ---------------------------------------------------------------------------
- // Load/Store interlock
- // Only dispatch one RVV load/store per cycle.
- // TODO(derekjchow): Relax this when LsuV2 can accept multiple ops
- val rvvLsuInterlock = if (p.enableRvv) {
- val isRvvLsu = decodedInsts.map(
- x => x.rvv.get.valid && x.rvv.get.bits.isLoadStore())
- val rvvSet = isRvvLsu.scan(false.B)(_ || _)
- rvvSet.map(!_).take(p.instructionLanes)
+ // Rvv Interlock: lane i passes only if fewer Rvv instructions precede it than the Rvv queue can take.
+ val rvvInterlock = if (p.enableRvv) {
+ val isRvv = decodedInsts.map(x => x.rvv.get.valid)
+ val isRvvCount = isRvv.scan(0.U(4.W))(_+_) // isRvvCount(i) = number of Rvv instructions in lanes before i
+ (0 until p.instructionLanes).map(
+ i => isRvvCount(i) < io.rvvQueueCapacity.get) // NOTE(review): capacity 0 also holds non-Rvv lanes — confirm intended
} else {
Seq.fill(p.instructionLanes)(true.B)
}
// ---------------------------------------------------------------------------
+ // LSU Interlock: with LsuV2, lane i passes only if fewer LSU ops precede it than the LSU queue can take.
+ val isLsu = decodedInsts.map(x => x.isLsu())
+ val isLsuCount = isLsu.scan(0.U(4.W))(_+_) // isLsuCount(i) = number of LSU ops in lanes before i
+ val lsuInterlock = if (p.useLsuV2) {
+ (0 until p.instructionLanes).map(
+ i => isLsuCount(i) < io.lsuQueueCapacity) // NOTE(review): capacity 0 also holds non-LSU lanes — confirm intended
+ } else {
+ // For LSU V1, no capacity interlock is applied here; backpressure comes from the ready/valid handshake.
+ Seq.fill(p.instructionLanes)(true.B)
+ }
+
+ // ---------------------------------------------------------------------------
// Undef
// Ensure undef op is only handled in the first slot
val undefInterlock = (0 until p.instructionLanes).map(i =>
@@ -478,8 +497,10 @@
!floatWriteAfterWrite(i) && // Avoid WAW hazards
!branchInterlock(i) && // Only branch/alu can be dispatched after a branch
!fence(i) && // Don't dispatch if fence interlocked
- rvvInterlock(i) && // Rvv interlock rules
- rvvLsuInterlock(i) && // Dispatch only one Rvv LsuOp
+ rvvConfigInterlock(i) && // Rvv config interlock rules
+ // Rvv load/store ops count as LSU ops (see isLsu()) and are gated by lsuInterlock below.
+ lsuInterlock(i) && // Ensure lsu instructions can be dispatched into queue
+ rvvInterlock(i) && // Ensure rvv instructions can be dispatched into queue
!undefInterlock(i) && // Ensure undef is only dispatched from first slot
io.retirement_buffer_nSpace.map(x => i.U < x).getOrElse(true.B) && // Retirement buffer needs space for our slot
singleStepInterlock(i)
diff --git a/hdl/chisel/src/kelvin/scalar/Lsu.scala b/hdl/chisel/src/kelvin/scalar/Lsu.scala
index e3e390f..8cbe82c 100644
--- a/hdl/chisel/src/kelvin/scalar/Lsu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Lsu.scala
@@ -57,6 +57,7 @@
val rvvState = Option.when(p.enableRvv)(Input(Valid(new RvvConfigState(p))))
val storeCount = Output(UInt(2.W))
+ val queueCapacity = Output(UInt(3.W))
val active = Output(Bool())
})
}
@@ -943,6 +944,8 @@
val io_rd_flt_pipe = Pipe(io_rd_flt_pre_pipe, p.lsuDelayPipelineLen)
io.rd_flt := io_rd_flt_pipe
+ io.queueCapacity := 0.U
+
assert(!ctrl.io.out.valid || PopCount(Cat(ctrl.io.out.bits.fldst, ctrl.io.out.bits.sldst, ctrl.io.out.bits.vldst)) <= 1.U)
assert(!data.io.out.valid || PopCount(Cat(data.io.out.bits.fldst, data.io.out.bits.sldst)) <= 1.U)
}
@@ -953,6 +956,7 @@
io.storeCount := 0.U
val opQueue = Module(new Queue(new LsuUOp(p), 4))
+ io.queueCapacity := opQueue.entries.U - opQueue.io.count
// Flush state
// DispatchV2 will only flush on first slot, when LSU is inactive.
diff --git a/hdl/chisel/src/kelvin/scalar/SCore.scala b/hdl/chisel/src/kelvin/scalar/SCore.scala
index 2a7d1ad..23a2048 100644
--- a/hdl/chisel/src/kelvin/scalar/SCore.scala
+++ b/hdl/chisel/src/kelvin/scalar/SCore.scala
@@ -120,6 +120,7 @@
dispatch.io.halted := csr.io.halted || csr.io.wfi || csr.io.dm.map(_.debug_mode).getOrElse(false.B)
dispatch.io.mactive := io.vcore.map(_.mactive).getOrElse(false.B)
dispatch.io.lsuActive := lsu.io.active
+ dispatch.io.lsuQueueCapacity := lsu.io.queueCapacity
dispatch.io.scoreboard.comb := regfile.io.scoreboard.comb
dispatch.io.scoreboard.regd := regfile.io.scoreboard.regd
dispatch.io.branchTaken := branchTaken
@@ -422,6 +423,7 @@
dispatch.io.rvv.get <> io.rvvcore.get.inst
dispatch.io.rvvState.get := io.rvvcore.get.configState
dispatch.io.rvvIdle.get := io.rvvcore.get.rvv_idle
+ dispatch.io.rvvQueueCapacity.get := io.rvvcore.get.queue_capacity
// Register inputs
io.rvvcore.get.rs := regfile.io.readData
diff --git a/hdl/verilog/rvv/design/RvvCore.sv b/hdl/verilog/rvv/design/RvvCore.sv
index d1b9c8b..86416b5 100644
--- a/hdl/verilog/rvv/design/RvvCore.sv
+++ b/hdl/verilog/rvv/design/RvvCore.sv
@@ -76,11 +76,11 @@
output RVVConfigState config_state,
// Idle
- output logic rvv_idle
+ output logic rvv_idle,
+ output logic [$clog2(2*N + 1)-1:0] queue_capacity
);
logic [N-1:0] frontend_cmd_valid;
RVVCmd [N-1:0] frontend_cmd_data;
- logic [$clog2(2*N + 1)-1:0] queue_capacity;
RvvFrontEnd#(.N(N)) frontend(
.clk(clk),
.rstn(rstn),
@@ -97,19 +97,21 @@
.reg_write_data_o(reg_write_data),
.cmd_valid_o(frontend_cmd_valid),
.cmd_data_o(frontend_cmd_data),
- .queue_capacity_i(queue_capacity),
+ .queue_capacity_i(queue_capacity_internal),
+ .queue_capacity_o(queue_capacity),
.config_state_valid(config_state_valid),
.config_state(config_state)
);
// Backpressure from backend fifo
logic [$clog2(`CQ_DEPTH):0] remaining_count_cq2rvs;
+ logic [$clog2(2*N + 1)-1:0] queue_capacity_internal; // NOTE(review): used in the frontend port list above before this declaration — move it ahead of the instantiation
// Back-pressure frontend
always_comb begin
if (remaining_count_cq2rvs > 2*N) begin
- queue_capacity = 2*N;
+ queue_capacity_internal = 2*N; // saturate at 2*N, the largest value the frontend is offered
end else begin
- queue_capacity = remaining_count_cq2rvs;
+ queue_capacity_internal = remaining_count_cq2rvs;
end
end
diff --git a/hdl/verilog/rvv/design/RvvFrontEnd.sv b/hdl/verilog/rvv/design/RvvFrontEnd.sv
index c37894b..5ef24c5 100644
--- a/hdl/verilog/rvv/design/RvvFrontEnd.sv
+++ b/hdl/verilog/rvv/design/RvvFrontEnd.sv
@@ -49,6 +49,7 @@
output logic [N-1:0] cmd_valid_o,
output RVVCmd [N-1:0] cmd_data_o,
input logic [CAPACITYBITS-1:0] queue_capacity_i, // Number of elements that can be enqueued
+ output logic [CAPACITYBITS-1:0] queue_capacity_o,
// Config state
output config_state_valid,
@@ -62,8 +63,9 @@
RVVConfigState config_state_q;
// Instructions to assemble into commands
- logic [N-1:0] valid_inst_q;
- RVVInstruction inst_q [N-1:0];
+ logic [N-1:0] valid_inst_q; // If the instruction in this slot is valid
+ count_t valid_inst_count_q; // The sum of valid_inst_q
+ RVVInstruction inst_q [N-1:0]; // The instruction in the slot
// Backpressure
count_t valid_in_psum [N:0];
@@ -85,22 +87,33 @@
assign config_state_valid = config_state_reduction;
assign config_state = config_state_q;
+ logic [CAPACITYBITS-1:0] queue_capacity; // capacity left after the instructions already latched in inst_q
+ assign queue_capacity_o = queue_capacity; // exported so the dispatcher can see the same value
+ always_comb begin
+ queue_capacity = queue_capacity_i - valid_inst_count_q; // assumes valid_inst_count_q <= queue_capacity_i (no underflow) — TODO confirm
+ end
+
logic inst_accepted [N-1:0];
+ count_t valid_inst_count_d; // next value of valid_inst_count_q
always_comb begin
for (int i = 0; i < N; i++) begin
- inst_accepted[i] = (valid_in_psum[i] < queue_capacity_i) && inst_valid_i[i];
+ inst_accepted[i] = (valid_in_psum[i] < queue_capacity) && inst_valid_i[i]; // accept while earlier valid inputs leave capacity
inst_ready_o[i] = inst_accepted[i];
end
+ valid_inst_count_d = (valid_in_psum[N] < queue_capacity) ? // min(valid_in_psum[N], queue_capacity)
+ valid_in_psum[N] : queue_capacity; // presumably valid_in_psum[N] is the total of valid inputs — confirm
end
always_ff @(posedge clk or negedge rstn) begin
if (!rstn) begin
for (int i = 0; i < N; i++) begin
valid_inst_q[i] <= 0;
+ valid_inst_count_q <= 0;
end;
end else begin
for (int i = 0; i < N; i++) begin
valid_inst_q[i] <= inst_accepted[i];
+ valid_inst_count_q <= valid_inst_count_d;
inst_q[i] <= inst_data_i[i];
end
end