blob: c06c7bbca8a1260e51f1b0b7f9550445a563a14e [file] [log] [blame]
"""VectorPipe module."""
import collections
import math
from typing import Any, Dict, Sequence, Optional, Tuple, Union
from buffered_queue import BufferedQueue
import counter
from counter import Counter
import instruction
from instruction import Instruction
import interfaces
import scoreboard
import utilities
class VectorPipe(interfaces.ExecPipeline):
"""Vector pipe model.
This pipeline supports flexible chaining and tailgating.
That is
- The vector register is split into a number of slices.
- Once a vector instruction starts producing result slices,
they are written to the register file in order at one slice per cycle.
- (See ScoreboardVector for more on issuing vector instructions.)
Scalar input registers are read at the start of the instruction (first
cycle of first slice).
Scalar output register are written to at the end of the instruction (last
cycle of the last slice).
"""
def __init__(self, name: str, kind: str, desc: Dict[str, Any], slices: int,
mem_sys,
rf_scoreboards: Dict[str, Union[scoreboard.Preemptive,
scoreboard.VecPreemptive]]
) -> None:
super().__init__(name, kind, desc["issue_queue"], desc["depth"])
# Execution Issue Queues
self._eiq = BufferedQueue(desc.get("eiq_size"))
self._can_skip_eiq = desc["can_skip_eiq"]
# The pipeline
self._slices = slices
self._pipelined = desc["pipelined"]
self._stage = collections.deque([None] * self.depth)
self._inflight_instr = None
self._inflight_next_slice = 0
# The writeback buffer
self._writebackq = BufferedQueue(desc.get("writeback_buff_size"))
# Interface to memory
self._mem = (mem_sys.elements[desc["memory_interface"]]
if "memory_interface" in desc else None)
self._load_stage = desc.get("load_stage")
self._fixed_load_latency = desc.get("fixed_load_latency")
self._stalling_loads = {}
self._store_stage = desc.get("store_stage")
self._fixed_store_latency = desc.get("fixed_store_latency")
self._stalling_stores = {}
self._rf_scoreboards = rf_scoreboards
def eslices(self, instr: Instruction) -> int:
"""The number of slices required to execute `instr`."""
return math.ceil(instr.max_emul() * self._slices)
def slice(self, accesses: Sequence[int], index: int,
eslices: int) -> Tuple[int, int]:
"""The memory access location and size, of a given slice.
Args:
accesses: a sequence of memory locations, one for each vector element.
index: the index of the required slice.
eslices: the total number of slices `accesses` should be split to.
# TODO(sflur): return the access size in bytes, instead of `count`.
Return: (access, count), where access is the first byte of the memory
location that should be accessed, and count is the number of elements
that should be accessed.
"""
alen = len(accesses)
slen = alen // eslices
start = index * slen
slen = min(slen, alen - start)
return (accesses[start], slen)
def reg_read_stall(self, instr: Instruction, s: int) -> bool:
return any(not self._rf_scoreboards[rf].can_read(instr, seq[s])
for rf, seq in self.input_seq_by_type(instr).items()
if len(seq) > s)
def reg_write_stall(self, instr: Instruction, s: int) -> bool:
return any(not self._rf_scoreboards[rf].can_write(instr, seq[s])
for rf, seq in self.output_seq_by_type(instr).items()
if len(seq) > s)
def sb_reg_read(self, instr: Instruction, s: int) -> None:
for rf, seq in self.input_seq_by_type(instr).items():
if len(seq) > s and seq[s]:
self._rf_scoreboards[rf].read(instr, seq[s])
def sb_buff_reg_write(self, instr: Instruction, s: int) -> None:
for rf, seq in self.output_seq_by_type(instr).items():
if len(seq) > s:
self._rf_scoreboards[rf].buff_write(instr, seq[s])
def sb_reg_write(self, instr: Instruction, s: int) -> None:
for rf, seq in self.output_seq_by_type(instr).items():
if len(seq) > s:
self._rf_scoreboards[rf].write(instr, seq[s])
def do_reg_writeback(self) -> None:
if self._writebackq:
instr, s = self._writebackq[0]
if not self.reg_write_stall(instr, s):
self.sb_reg_write(instr, s)
self._writebackq.popleft()
if s + 1 == self.eslices(instr):
self.retired_instrs.append(instr)
def stall(self, cntr: Counter) -> bool:
# Check if last stage needs to do reg writes, and the writeback buffer
# is full.
if (self._stage[-1] and any(
self._stage[-1][1] < len(seq) and seq[self._stage[-1][1]]
for _, seq in
self.output_seq_by_type(self._stage[-1][0]).items()) and
self._writebackq.is_buffer_full()):
return True
# Check if memory accesses are waiting for reply.
if (any(self._stalling_loads.values()) or
any(self._stalling_stores.values())):
cntr.vector_load_store_stall += 1
return True
return False
def do_load(self) -> None:
if self._load_stage is None:
return
if (self._stage[self._load_stage] and
self._stage[self._load_stage][0].loads):
instr, s = self._stage[self._load_stage]
load, _size = self.slice(instr.loads, s, self.eslices(instr))
if (instr, s, load) not in self._stalling_loads:
# TODO(sflur): pass size to issue_load
self._mem.issue_load((instr, s), load)
self._stalling_loads[(instr, s, load)] = None
if (self._stage[self._load_stage + self._fixed_load_latency] and
self._stage[self._load_stage +
self._fixed_load_latency][0].loads):
instr, s = self._stage[self._load_stage + self._fixed_load_latency]
load, _ = self.slice(instr.loads, s, self.eslices(instr))
if self._stalling_loads[(instr, s, load)] is None:
self._stalling_loads[(instr, s, load)] = True
for load in self._mem.take_load_replys((instr, s)):
self._stalling_loads[(instr, s, load)] = False
def do_store(self) -> None:
if self._store_stage is None:
return
if self._stage[self._store_stage] and self._stage[
self._store_stage][0].stores:
instr, s = self._stage[self._store_stage]
store, _size = self.slice(instr.stores, s, self.eslices(instr))
if (instr, s, store) not in self._stalling_stores:
# TODO(sflur): pass size to issue_store
self._mem.issue_store((instr, s), store)
self._stalling_stores[(instr, s, store)] = None
if (self._stage[self._store_stage + self._fixed_store_latency] and
self._stage[self._store_stage +
self._fixed_store_latency][0].stores):
instr, s = self._stage[self._store_stage +
self._fixed_store_latency]
store, _ = self.slice(instr.stores, s, self.eslices(instr))
if self._stalling_stores[(instr, s, store)] is None:
self._stalling_stores[(instr, s, store)] = True
for store in self._mem.take_store_replys((instr, s)):
self._stalling_stores[(instr, s, store)] = False
# Implements interfaces.ExecPipeline
def reset(self, cntr: Counter) -> None:
super().reset(cntr)
# TODO(sflur): implement proper reset
cntr.utilizations[f"{self.name}.eiq"] = counter.Utilization(
self._eiq.size)
cntr.utilizations[f"{self.name}.pipe"] = counter.Utilization(
len(self._stage))
cntr.utilizations[f"{self.name}.wbq"] = counter.Utilization(
self._writebackq.size)
# Implements interfaces.ExecPipeline
def tick(self, cntr: Counter) -> None:
super().tick(cntr)
self.retired_instrs.clear()
self.do_reg_writeback()
if not self.stall(cntr):
# Cleanup self.stalling_loads
if (self._load_stage is not None and
self._stage[self._load_stage + self._fixed_load_latency] and
self._stage[self._load_stage +
self._fixed_load_latency][0].loads):
instr, s = self._stage[self._load_stage +
self._fixed_load_latency]
load, _ = self.slice(instr.loads, s, self.eslices(instr))
# The assertion holds because self.stall() above is True.
assert not self._stalling_loads.get((instr, s, load), False)
del self._stalling_loads[(instr, s, load)]
# Cleanup self.stalling_stores
if (self._store_stage is not None and
self._stage[self._store_stage +
self._fixed_store_latency] and
self._stage[self._store_stage +
self._fixed_store_latency][0].stores):
instr, s = self._stage[self._store_stage +
self._fixed_store_latency]
store, _ = self.slice(instr.stores, s, self.eslices(instr))
# The assertion holds because self.stall() above is True.
assert not self._stalling_stores.get((instr, s, store), False)
del self._stalling_stores[(instr, s, store)]
# Shift stages
st = self._stage.pop()
if st:
instr, s = st
if any(
len(seq) > s and seq[s]
for _, seq in self.output_seq_by_type(instr).items()):
self._writebackq.buffer((instr, s))
cntr.utilizations[f"{self.name}.wbq"].count += 1
self.sb_buff_reg_write(instr, s)
elif s + 1 == self.eslices(instr):
self.retired_instrs.append(instr)
# Issue the next slice into the piprline.
if (self._inflight_instr and not self.reg_read_stall(
self._inflight_instr, self._inflight_next_slice)):
self.sb_reg_read(self._inflight_instr,
self._inflight_next_slice)
self._stage.appendleft(
(self._inflight_instr, self._inflight_next_slice))
cntr.utilizations[f"{self.name}.pipe"].count += 1
self._inflight_next_slice += 1
if self._inflight_next_slice == self.eslices(
self._inflight_instr):
self._inflight_instr = None
else:
self._stage.appendleft(None)
self.do_load()
self.do_store()
# Try to issue each of the instructions in `eiq`.
if self.is_ready():
for _ in range(len(self._eiq)):
instr = self._eiq.popleft()
if self.try_issue(instr, cntr):
break
self._eiq.append(instr)
# Implements interfaces.ExecPipeline
def tock(self, cntr: Counter) -> None:
super().tock(cntr)
self.retired_instrs.clear()
cntr.utilizations[f"{self.name}.pipe"].occupied += len(
list(1 for i in self._stage if i))
self._eiq.flush()
cntr.utilizations[f"{self.name}.eiq"].occupied += len(self._eiq)
self._writebackq.flush()
cntr.utilizations[f"{self.name}.wbq"].occupied += len(self._writebackq)
# Implements interfaces.ExecPipeline
def pending(self) -> int:
eiq_count = len(list(self._eiq.chain()))
pipe_count = len(list(1 for i in self._stage if i))
wbq_count = len(list(self._writebackq.chain()))
return eiq_count + pipe_count + wbq_count
# Implements interfaces.ExecPipeline
def try_dispatch(self, instr: Instruction, cntr: Counter) -> bool:
if self._eiq.is_buffer_full():
return False
inputs = self.input_seq_by_type(instr)
outputs = self.output_seq_by_type(instr)
for rf in inputs.keys() | outputs.keys():
reads = utilities.flatten(inputs.get(rf, []))
writes = utilities.flatten(outputs.get(rf, []))
self._rf_scoreboards[rf].insert_accesses(instr, reg_reads=reads,
reg_writes=writes)
if not (self._can_skip_eiq and self.is_ready() and
self.try_issue(instr, cntr)):
self._eiq.buffer(instr)
cntr.utilizations[f"{self.name}.eiq"].count += 1
if instr.loads or instr.stores:
cntr.vector_load_store += 1
return True
def is_ready(self) -> bool:
"""Check if the pipe is ready to accept a new instruction."""
if self._pipelined:
return self._inflight_instr is None and self._stage[0] is None
return self._inflight_instr is None and all(
s is None for s in self._stage)
def try_issue(self, instr: Instruction, cntr: Counter) -> bool:
"""Issue an instruction."""
if not all(sb.can_issue(instr) for sb in self._rf_scoreboards.values()):
return False
if self.reg_read_stall(instr, 0):
return False
assert self._stage[0] is None
self._stage[0] = (instr, 0)
cntr.utilizations[f"{self.name}.pipe"].count += 1
assert self._inflight_instr is None
if 1 < self.eslices(instr):
self._inflight_instr = instr
self._inflight_next_slice = 1
for sb in self._rf_scoreboards.values():
sb.issue(instr)
self.sb_reg_read(instr, 0)
return True
def vec_reg_seq(self, reg: str, input_reg: bool, emul: Union[int, float],
max_emul: Union[int, float]) -> Sequence[Optional[str]]:
base = int(reg[1:])
if emul < 1:
seq = [f"{reg}.{s}" for s in range(math.ceil(emul * self._slices))]
else:
emul = int(emul)
seq = [
f"{reg[0]}{base + g}.{s}" for g in range(emul)
for s in range(self._slices)
]
if (emul == max_emul or (emul < 1 and self._slices < 1 / emul)):
return seq
assert int(max_emul / emul) == 2
# Interleave Nones with `seq`
if input_reg:
seq = zip(seq, [None] * len(seq))
else:
seq = zip([None] * len(seq), seq)
return utilities.flatten(seq)
def input_seq(self, instr: Instruction, reg: str) -> Sequence[str]:
if instruction.is_vector_register(reg):
assert instr.lmul is not None
# TODO(sflur): anymore cases of input widening?
if ((instr.mnemonic.endswith(".wv") or
instr.mnemonic.endswith(".wx") or
instr.mnemonic.endswith(".wf") or
instr.mnemonic.endswith(".wi")) and instr.operands[1] == reg):
emul = 2 * instr.lmul
else:
emul = instr.lmul
return self.vec_reg_seq(reg, True, emul, instr.max_emul())
return [reg]
def output_seq(self, instr: Instruction, reg: str) -> Sequence[str]:
if instruction.is_vector_register(reg):
assert instr.lmul is not None
# TODO(sflur): anymore cases of output widening?
if ((instr.mnemonic.startswith("vw") or
instr.mnemonic.startswith("vfw")) and
instr.operands[0] == reg):
emul = 2 * instr.lmul
else:
emul = instr.lmul
return self.vec_reg_seq(reg, False, emul, instr.max_emul())
res = [None] * self.eslices(instr)
res[-1] = reg
return res
def input_seq_by_type(
self, instr: Instruction) -> Dict[str, Sequence[Sequence[str]]]:
"""Compute a map from register-files to sequences of input register
sets.
A register file is mapped to a sequence of sets, where set i is the set
of registers that will be read from by slice i.
"""
res = {}
for ty, regs in instr.inputs_by_type().items():
seq = [collections.deque() for _ in range(self.eslices(instr))]
for reg in regs:
for i, r in enumerate(self.input_seq(instr, reg)):
if r:
seq[i].append(r)
res[ty] = seq
return res
def output_seq_by_type(
self, instr: Instruction) -> Dict[str, Sequence[Sequence[str]]]:
"""Compute a map from register-files to sequences of output register
sets.
A register file is mapped to a sequence of sets, where set i is the set
of registers that will be written to by slice i.
"""
res = {}
for ty, regs in instr.outputs_by_type().items():
seq = [collections.deque() for _ in range(self.eslices(instr))]
for reg in regs:
for i, r in enumerate(self.output_seq(instr, reg)):
if r:
seq[i].append(r)
res[ty] = seq
return res
# Implements interfaces.ExecPipeline
def print_state_detailed(self, file) -> None:
eiq_str = ", ".join(str(i) for i in reversed(list(self._eiq.chain())))
stages = ", ".join(f"{i[0]} ({i[1]})" if i else "-"
for i in self._stage)
wbq_str = ", ".join(f"{i[0]} ({i[1]})"
for i in reversed(list(self._writebackq.chain())))
pipe_str = (f"{eiq_str if eiq_str else '-'}"
f" > {stages}"
f" > {wbq_str if wbq_str else '-'}")
print(f"[{self.name}] {pipe_str}", file=file)
# Implements interfaces.ExecPipeline
def get_state_three_valued_header(self) -> Sequence[str]:
return ["eiq", self.kind, "wbq"]
# Implements interfaces.ExecPipeline
def get_state_three_valued(self, vals: Sequence[str]) -> Sequence[str]:
if all(self._stage):
# Full
pipe_str = vals[2]
elif any(self._stage):
# Partial
pipe_str = vals[1]
else:
# Empty
pipe_str = vals[0]
return [self._eiq.pp_three_valued(vals),
pipe_str,
self._writebackq.pp_three_valued(vals)]