execution unit: model of an execution unit

The model includes an execution issue queue (EIQ), a pipeline (scalar or
vector) and a writeback queue.

The commit also includes scoreboard models for scalar and vector register
files.

Change-Id: I6f4f116dd06d45ab76182dc09b35ce6aa69afc0d
diff --git a/exec_unit.py b/exec_unit.py
new file mode 100644
index 0000000..2d4addc
--- /dev/null
+++ b/exec_unit.py
@@ -0,0 +1,151 @@
+import collections
+import sys
+from typing import Any, Dict, Sequence, Union
+
+from counter import Counter
+from instruction import Instruction
+import interfaces
+import scoreboard
+
+class ExecUnit(interfaces.ExecUnit):
+    """Execution unit model."""
+
+    def __init__(
+        self, config: Dict[str, Any], pipe_map: Dict[str, str],
+        rf_scoreboards: Dict[str, Union[scoreboard.Preemptive,
+                                        scoreboard.VecPreemptive]]
+    ):
+        super().__init__("EX")
+
+        self._branch_prediction = config["branch_prediction"]
+
+        self._fetch_unit = None
+        self._sched_unit = None
+
+        self._pipe_map = pipe_map
+
+        # State
+        self._rf_scoreboards = rf_scoreboards
+        self._pipes = {}
+        self._retired_instructions = collections.deque()
+
+    def add_pipe(self, kind: str,
+                 pipes: Sequence[interfaces.ExecPipeline]) -> None:
+        assert pipes
+        assert kind not in self._pipes
+        self._pipes[kind] = pipes
+
+    def connect(self, fetch_unit: interfaces.FetchUnit,
+                sched_unit: interfaces.SchedUnit) -> None:
+        self._fetch_unit = fetch_unit
+        self._sched_unit = sched_unit
+
+    # Implements interfaces.ExecUnit
+    def pending(self) -> int:
+        return sum(p.pending() for ps in self._pipes.values() for p in ps)
+
+    # Implements interfaces.ExecUnit
+    def get_issue_queue_id(self, instr: Instruction) -> str:
+        kind = self.get_functional_unit(instr)
+        return self._pipes[kind][0].issue_queue_id
+
+    def get_functional_unit(self, instr: Instruction) -> str:
+        """Return the functional unit kind the instruction will execute in."""
+        try:
+            return self._pipe_map[instr.mnemonic]
+        except KeyError:
+            self.logger.error("unknown pipe for instruction '%s'",
+                              instr.mnemonic)
+            sys.exit(1)
+
+    # Implements interfaces.ExecUnit
+    def reset(self, cntr: Counter) -> None:
+        super().reset(cntr)
+        # TODO(sflur): implement proper reset
+        for ps in self._pipes.values():
+            for p in ps:
+                p.reset(cntr)
+
+    # Implements interfaces.ExecUnit
+    def tick(self, cntr: Counter) -> None:
+        """Move instructions from dispatch queues, in sched_unit, to functional
+        units.
+
+        Instructions move in lockstep when possible. To achieve lockstep, the
+        elements are processed in the reverse of the instruction flow order.
+        """
+        super().tick(cntr)
+
+        self._retired_instructions.clear()
+
+        for sb in self._rf_scoreboards.values():
+            sb.tick(cntr)
+
+        for ps in self._pipes.values():
+            for p in ps:
+                p.tick(cntr)
+                self._retired_instructions.extend(p.retired_instrs)
+
+        if self._branch_prediction == "none":
+            for instr in self._retired_instructions:
+                if instr.is_branch:
+                    self._sched_unit.branch_resolved()
+                    self._fetch_unit.branch_resolved()
+                    break
+
+        for dq in self._sched_unit.queues:
+            while dq:
+                if self.dispatch_instruction(dq[0], cntr):
+                    dq.popleft()
+                else:
+                    break
+
+        # Update retired instruction count.
+        cntr.retired_instruction_count += len(self._retired_instructions)
+
+    # Implements interfaces.ExecUnit
+    def tock(self, cntr: Counter) -> None:
+        """Tock the scoreboards and pipes, and count retired instructions."""
+        super().tock(cntr)
+
+        self._retired_instructions.clear()
+
+        for sb in self._rf_scoreboards.values():
+            sb.tock(cntr)
+
+        for ps in self._pipes.values():
+            for p in ps:
+                p.tock(cntr)
+                self._retired_instructions.extend(p.retired_instrs)
+
+        # Update retired instruction count.
+        cntr.retired_instruction_count += len(self._retired_instructions)
+
+    def dispatch_instruction(self, instr: Instruction, cntr: Counter):
+        """Offer `instr` to the pipes of its kind, in order.
+
+        Returns True as soon as one pipe accepts it, False if none does.
+        """
+        # TODO(sflur): use other policies to choose pipe, instead of the first
+        # free one.
+        candidates = self._pipes[self.get_functional_unit(instr)]
+        return any(p.try_dispatch(instr, cntr) for p in candidates)
+
+    # Implements interfaces.ExecUnit
+    def print_state_detailed(self, file) -> None:
+        for pipes in self._pipes.values():
+            for pipe in pipes:
+                pipe.print_state_detailed(file)
+
+        print("[re] " + ", ".join(str(i) for i in self._retired_instructions),
+              file=file)
+
+    # Implements interfaces.ExecUnit
+    def get_state_three_valued_header(self) -> Sequence[str]:
+        return [pipe.get_state_three_valued_header()
+                for pipes in self._pipes.values() for pipe in pipes]
+
+    # Implements interfaces.ExecUnit
+    def get_state_three_valued(self, vals: Sequence[str]) -> Sequence[str]:
+        return [pipe.get_state_three_valued(vals)
+                for pipes in self._pipes.values() for pipe in pipes]
diff --git a/scalar_pipe.py b/scalar_pipe.py
new file mode 100644
index 0000000..1cb3586
--- /dev/null
+++ b/scalar_pipe.py
@@ -0,0 +1,300 @@
+"""ScalarPipe module."""
+
+import collections
+from typing import Any, Dict, Sequence, Union
+
+from buffered_queue import BufferedQueue
+import counter
+from counter import Counter
+from instruction import Instruction
+import interfaces
+import scoreboard
+
+
+class ScalarPipe(interfaces.ExecPipeline):
+    def __init__(self, name:str, kind: str, desc: Dict[str, Any], mem_sys,
+                 rf_scoreboards: Dict[str, Union[scoreboard.Preemptive,
+                                                 scoreboard.VecPreemptive]]
+                 ) -> None:
+        super().__init__(name, kind, desc["issue_queue"], desc["depth"])
+
+        # Execution Issue Queues
+        self._eiq = BufferedQueue(desc.get("eiq_size"))
+        self._can_skip_eiq = desc["can_skip_eiq"]
+
+        # The pipeline
+        self._pipelined = desc["pipelined"]
+        self._stage = collections.deque([None] * self.depth)
+
+        # The writeback buffer
+        self._writebackq = BufferedQueue(desc.get("writeback_buff_size"))
+
+        # Interface to memory
+        self._mem = (mem_sys.elements[desc["memory_interface"]]
+                     if "memory_interface" in desc else None)
+
+        self._load_stage = desc.get("load_stage")
+        self._fixed_load_latency = desc.get("fixed_load_latency")
+        self._stalling_loads = {}
+
+        self._store_stage = desc.get("store_stage")
+        self._fixed_store_latency = desc.get("fixed_store_latency")
+        self._stalling_stores = {}
+
+        self._rf_scoreboards = rf_scoreboards
+
+    def reg_read_stall(self, instr: Instruction) -> bool:
+        return any(not self._rf_scoreboards[rf].can_read(instr, regs)
+                   for rf, regs in instr.inputs_by_type().items())
+
+    def reg_write_stall(self, instr: Instruction) -> bool:
+        return any(not self._rf_scoreboards[rf].can_write(instr, regs)
+                   for rf, regs in instr.outputs_by_type().items())
+
+    def sb_reg_read(self, instr: Instruction) -> None:
+        for rf, regs in instr.inputs_by_type().items():
+            self._rf_scoreboards[rf].read(instr, regs)
+
+    def sb_buff_reg_write(self, instr: Instruction) -> None:
+        for rf, regs in instr.outputs_by_type().items():
+            self._rf_scoreboards[rf].buff_write(instr, regs)
+
+    def sb_reg_write(self, instr: Instruction) -> None:
+        for rf, regs in instr.outputs_by_type().items():
+            self._rf_scoreboards[rf].write(instr, regs)
+
+    def do_reg_writeback(self) -> None:
+        """Write back and retire the head of the WBQ, unless it stalls."""
+        if self._writebackq and not self.reg_write_stall(self._writebackq[0]):
+            head = self._writebackq.popleft()
+            self.sb_reg_write(head)
+            self.retired_instrs.append(head)
+
+    def stall(self, cntr: Counter) -> bool:
+        # Check if last stage needs to do reg writes, and the writeback buffer
+        # is full.
+        if (self._stage[-1] and self._stage[-1].outputs_by_type() and
+            self._writebackq.is_buffer_full()):
+            return True
+
+        # Check if memory accesses are waiting for reply.
+        if (any(self._stalling_loads.values()) or
+            any(self._stalling_stores.values())):
+            cntr.scalar_load_store_stall += 1
+            return True
+
+        return False
+
+    def do_load(self) -> None:
+        if self._load_stage is None:
+            return
+
+        if self._stage[self._load_stage]:
+            inst = self._stage[self._load_stage]
+            # TODO(sflur): handle multiple loads?
+            assert len(inst.loads) <= 1
+            for load in inst.loads:
+                if (inst, load) not in self._stalling_loads:
+                    self._mem.issue_load(inst, load)
+                    self._stalling_loads[(inst, load)] = None
+
+        if self._stage[self._load_stage + self._fixed_load_latency]:
+            inst = self._stage[self._load_stage + self._fixed_load_latency]
+            for load in inst.loads:
+                if self._stalling_loads[(inst, load)] is None:
+                    self._stalling_loads[(inst, load)] = True
+            for load in self._mem.take_load_replys(inst):
+                self._stalling_loads[(inst, load)] = False
+
+    def do_store(self) -> None:
+        if self._store_stage is None:
+            return
+
+        if self._stage[self._store_stage]:
+            inst = self._stage[self._store_stage]
+            # TODO(sflur): handle multiple stores?
+            assert len(inst.stores) <= 1
+            for store in inst.stores:
+                if (inst, store) not in self._stalling_stores:
+                    self._mem.issue_store(inst, store)
+                    self._stalling_stores[(inst, store)] = None
+
+        if self._stage[self._store_stage + self._fixed_store_latency]:
+            inst = self._stage[self._store_stage + self._fixed_store_latency]
+            for store in inst.stores:
+                if self._stalling_stores[(inst, store)] is None:
+                    self._stalling_stores[(inst, store)] = True
+            for store in self._mem.take_store_replys(inst):
+                self._stalling_stores[(inst, store)] = False
+
+    # Implements interfaces.ExecPipeline
+    def reset(self, cntr: Counter) -> None:
+        super().reset(cntr)
+        # TODO(sflur): implement proper reset
+        cntr.utilizations[f"{self.name}.eiq"] = counter.Utilization(
+            self._eiq.size)
+        cntr.utilizations[f"{self.name}.pipe"] = counter.Utilization(
+            len(self._stage))
+        cntr.utilizations[f"{self.name}.wbq"] = counter.Utilization(
+            self._writebackq.size)
+
+    # Implements interfaces.ExecPipeline
+    def tick(self, cntr: Counter) -> None:
+        """Move instructions from EIQ to pipeline, to WBQ, to RF.
+
+        Instructions move in lockstep when possible. To achieve lockstep, the
+        elements are processed in the reverse of the instruction flow order.
+        """
+        super().tick(cntr)
+
+        self.retired_instrs.clear()
+
+        self.do_reg_writeback()
+
+        if not self.stall(cntr):
+            # Cleanup self._stalling_loads. stall() returned False above, so
+            # no load is still waiting for a reply and the assertion holds.
+            if (self._load_stage is not None and
+                    self._stage[self._load_stage + self._fixed_load_latency]):
+                inst = self._stage[self._load_stage + self._fixed_load_latency]
+                for load in inst.loads:
+                    assert not self._stalling_loads.get((inst, load), False)
+                    del self._stalling_loads[(inst, load)]
+
+            # Cleanup self._stalling_stores, under the same guarantee.
+            if (self._store_stage is not None and
+                    self._stage[self._store_stage + self._fixed_store_latency]):
+                inst = self._stage[self._store_stage +
+                                   self._fixed_store_latency]
+                for store in inst.stores:
+                    assert not self._stalling_stores.get((inst, store), False)
+                    del self._stalling_stores[(inst, store)]
+
+            # Shift stages: instrs with outputs go to the WBQ, others retire.
+            instr = self._stage.pop()
+            if instr:
+                if instr.outputs_by_type().items():
+                    self._writebackq.buffer(instr)
+                    cntr.utilizations[f"{self.name}.wbq"].count += 1
+                    self.sb_buff_reg_write(instr)
+                else:
+                    self.retired_instrs.append(instr)
+            self._stage.appendleft(None)
+
+        self.do_load()
+        self.do_store()
+
+        # Try to issue instructions from eiq to pipeline, until one succeeds.
+        if self.is_ready():
+            for _ in range(len(self._eiq)):
+                instr = self._eiq.popleft()
+                if self.try_issue(instr, cntr):
+                    break
+
+                self._eiq.append(instr)
+
+    # Implements interfaces.ExecPipeline
+    def tock(self, cntr: Counter) -> None:
+        """Flush the buffered queues and sample the EIQ/pipe/WBQ occupancy."""
+        super().tock(cntr)
+        self.retired_instrs.clear()
+
+        cntr.utilizations[f"{self.name}.pipe"].occupied += sum(
+            1 for i in self._stage if i)
+
+        self._eiq.flush()
+        cntr.utilizations[f"{self.name}.eiq"].occupied += len(self._eiq)
+
+        self._writebackq.flush()
+        cntr.utilizations[f"{self.name}.wbq"].occupied += len(self._writebackq)
+
+    # Implements interfaces.ExecPipeline
+    def pending(self) -> int:
+        """Sum the EIQ and WBQ chain() lengths and the occupied stages."""
+        queued = sum(1 for _ in self._eiq.chain())
+        in_stages = sum(1 for i in self._stage if i)
+        return queued + in_stages + sum(1 for _ in self._writebackq.chain())
+
+    # Implements interfaces.ExecPipeline
+    def try_dispatch(self, instr: Instruction, cntr: Counter) -> bool:
+        if self._eiq.is_buffer_full():
+            return False
+
+        inputs = instr.inputs_by_type()
+        outputs = instr.outputs_by_type()
+
+        for rf in inputs.keys() | outputs.keys():
+            reads = inputs.get(rf, [])
+            writes = outputs.get(rf, [])
+            self._rf_scoreboards[rf].insert_accesses(instr, reg_reads=reads,
+                                                     reg_writes=writes)
+
+        if not (self._can_skip_eiq and self.is_ready() and
+                self.try_issue(instr, cntr)):
+            self._eiq.buffer(instr)
+            cntr.utilizations[f"{self.name}.eiq"].count += 1
+
+        if instr.loads or instr.stores:
+            cntr.scalar_load_store += 1
+
+        return True
+
+    def is_ready(self) -> bool:
+        """Check if the pipe is ready to accept a new instruction."""
+        if self._pipelined:
+            return self._stage[0] is None
+
+        return all(s is None for s in self._stage)
+
+    def try_issue(self, instr: Instruction, cntr: Counter) -> bool:
+        """Issue an instruction."""
+
+        if not all(sb.can_issue(instr) for sb in self._rf_scoreboards.values()):
+            return False
+
+        if self.reg_read_stall(instr):
+            return False
+
+        assert self._stage[0] is None
+        self._stage[0] = instr
+        cntr.utilizations[f"{self.name}.pipe"].count += 1
+
+        for sb in self._rf_scoreboards.values():
+            sb.issue(instr)
+
+        self.sb_reg_read(instr)
+
+        return True
+
+    # Implements interfaces.ExecPipeline
+    def print_state_detailed(self, file) -> None:
+        eiq_str = ", ".join(str(i) for i in reversed(list(self._eiq.chain())))
+        stages = ", ".join(str(i) if i else "-" for i in self._stage)
+        wbq_str = ", ".join(
+            str(i) for i in reversed(list(self._writebackq.chain())))
+
+        pipe_str = (f"{eiq_str if eiq_str else '-'}"
+                    f" > {stages}"
+                    f" > {wbq_str if wbq_str else '-'}")
+
+        print(f"[{self.name}] {pipe_str}", file=file)
+
+    # Implements interfaces.ExecPipeline
+    def get_state_three_valued_header(self) -> Sequence[str]:
+        return ["eiq", self.kind, "wbq"]
+
+    # Implements interfaces.ExecPipeline
+    def get_state_three_valued(self, vals: Sequence[str]) -> Sequence[str]:
+        if all(self._stage):
+            # Full
+            pipe_str = vals[2]
+        elif any(self._stage):
+            # Partial
+            pipe_str = vals[1]
+        else:
+            # Empty
+            pipe_str = vals[0]
+
+        return [self._eiq.pp_three_valued(vals),
+                pipe_str,
+                self._writebackq.pp_three_valued(vals)]
diff --git a/scoreboard.py b/scoreboard.py
new file mode 100644
index 0000000..95351ac
--- /dev/null
+++ b/scoreboard.py
@@ -0,0 +1,385 @@
+"""scoreboard module."""
+
+import collections
+import sys
+from typing import Any, Dict, Sequence, Tuple
+
+from counter import Counter
+from instruction import Instruction
+import interfaces
+
+
+class Preemptive(interfaces.Scoreboard):
+    """Scoreboard that stalls functional units."""
+
+    def __init__(self, uid: str, desc: Dict[str, Any]) -> None:
+        super().__init__(uid)
+
+        # `None` means unrestricted
+        self.read_ports = desc.get("read_ports")
+        self.dedicated_read_ports = set(desc.get("dedicated_read_ports", []))
+
+        # `None` means unrestricted
+        self.write_ports = desc.get("write_ports")
+        self.dedicated_write_ports = set(desc.get("dedicated_write_ports", []))
+
+        # `self.rw_deps[instr][reg]` is the instruction from which `instr`
+        # reads `reg`'s value, if that instruction is still in-flight, and
+        # `None` otherwise.
+        self.rw_deps = {}
+
+        # `self.ww_deps[instr][reg]` is the instruction that writes to `reg`
+        # just before `instr` writes to it.
+        self.ww_deps = {}
+
+        # `self.wr_deps[instr][reg]` is the set of instructions that must do
+        # their reads from `reg` before `instr` does its write to `reg`.
+        self.wr_deps = collections.defaultdict(dict)
+
+        # `self.writes[reg]` is the last instruction, so far, that intends to
+        # write to `reg`, if that instruction is in-flight, and `None`
+        # otherwise.
+        self.writes = collections.defaultdict(lambda: None)
+
+        # `self.reads[reg]` is the set of instructions that follow
+        # `self.writes[reg]`, and read from `reg`.
+        self.reads = collections.defaultdict(set)
+
+        # The set of instructions that have been issued to a functional unit.
+        # This is a coarse way of preventing deadlocks.
+        self.issued = set()
+
+        # `self.write_buff[instr]` is the set of registers for which `instr`
+        # has already computed a write value that can be used in a bypass.
+        self.write_buff = collections.defaultdict(set)
+
+        # The number of register reads/writes done so far in the current tick.
+        self.used_read_ports = 0
+        self.used_write_ports = 0
+
+    def dump(self, file=sys.stdout) -> None:
+        print(f"-- Scoreboard {self.name}: --", file=file)
+
+        print(f"read ports: {self.read_ports}", file=file)
+        print(f"dedicated read ports: {self.dedicated_read_ports}", file=file)
+        print(f"write ports: {self.write_ports}", file=file)
+        print(f"dedicated write ports: {self.dedicated_write_ports}", file=file)
+
+        print(f"issued instructions: {', '.join(str(i) for i in self.issued)}",
+              file=file)
+
+        def pp_instr(i) -> str:
+            if i is None:
+                return "None"
+
+            if isinstance(i, Instruction):
+                return str(i)
+
+            if isinstance(i, Tuple):
+                return f"{pp_instr(i[0])} ({i[1]})"
+
+            return "???"
+
+        for i, deps in self.rw_deps.items():
+            print(f"rw {pp_instr(i)}: " +
+                  ", ".join(f"({r}: {pp_instr(d)})" for r, d in deps.items()),
+                  file=file)
+
+        for i, deps in self.ww_deps.items():
+            print(f"ww {pp_instr(i)}: " +
+                  ", ".join(f"({r}: {pp_instr(d)})" for r, d in deps.items()),
+                  file=file)
+
+        for i, deps in self.wr_deps.items():
+            print(f"wr {pp_instr(i)}: " +
+                  ", ".join(f"({r}: " + "; ".join(pp_instr(d)
+                                                  for d in ds) + ")"
+                            for r, ds in deps.items()),
+                  file=file)
+
+    # Implements interfaces.Scoreboard
+    def insert_accesses(self, instr: Instruction, *,
+                        # keyword-only args:
+                        reg_reads: Sequence[str],
+                        reg_writes: Sequence[str]) -> None:
+        for reg in reg_reads:
+            # We assume instructions never read their own writes
+            assert self.writes[reg] != instr
+
+            self.rw_deps.setdefault(
+                instr, {}
+            )[reg] = self.writes[reg]
+            self.reads[reg].add(instr)
+
+        for reg in reg_writes:
+            # Can instructions write twice to the same reg?
+            assert self.writes[reg] != instr
+
+            self.ww_deps.setdefault(
+                instr, {}
+            )[reg] = self.writes[reg]
+            self.wr_deps.setdefault(instr, {}).setdefault(
+                reg, set()).update(self.reads[reg] - {instr})
+
+            self.writes[reg] = instr
+            self.reads[reg].clear()
+
+    def read_port_regs(self, instr, regs):
+        """Return the regs that need to use a non-dedicated read port."""
+        return [
+            r for r in regs if
+            (r not in self.dedicated_read_ports
+             # rw_deps which are not None will be read from
+             # the write-buffer.
+             # TODO(sflur): what are the restrictions on the
+             # write-buffer?
+             and self.rw_deps[instr][r] is None)
+        ]
+
+    def check_read_ports(self, instr, regs) -> bool:
+        if self.read_ports is None:
+            return True
+
+        return (self.used_read_ports + len(self.read_port_regs(instr, regs)) <=
+                self.read_ports)
+
+    # Implements interfaces.Scoreboard
+    def can_read(self, instr, regs) -> bool:
+        if not self.check_read_ports(instr, regs):
+            return False
+
+        for reg in regs:
+            dep = self.rw_deps[instr][reg]
+            if dep and reg not in self.write_buff[dep]:
+                return False
+        return True
+
+    def write_port_regs(self, instr, regs):
+        """Return the regs that need to use a non-dedicated write port."""
+        # `instr` is unused here; it is kept to mirror `read_port_regs`.
+        del instr
+        return [r for r in regs if r not in self.dedicated_write_ports]
+
+    def check_write_ports(self, instr, regs) -> bool:
+        if self.write_ports is None:
+            return True
+
+        return (self.used_write_ports + len(self.write_port_regs(instr, regs))
+                <= self.write_ports)
+
+    # Implements interfaces.Scoreboard
+    def can_write(self, instr, regs) -> bool:
+        if not self.check_write_ports(instr, regs):
+            return False
+
+        return not (any(self.ww_deps[instr][reg] for reg in regs) or
+                    any(self.wr_deps[instr][reg] for reg in regs))
+
+    def update_used_read_ports(self, instr, regs) -> None:
+        self.used_read_ports += len(self.read_port_regs(instr, regs))
+
+    # Implements interfaces.Scoreboard
+    def read(self, instr, regs) -> None:
+        """Record that `instr` has read `regs` and drop the related deps."""
+        self.update_used_read_ports(instr, regs)
+
+        for reg in regs:
+            # TODO(sflur): In RVV vec reg groups must be a multiple of the
+            # LMUL. Hence, `vadd.vv v0, v1, v2` with LMUL=2 is not a valid
+            # instruction (it's actually "reserved"), because v1 is not a
+            # multiple of 2. If we want to support architectures where this is
+            # allowed, consider a valid instruction like `vadd.vv v0, v1, v2`
+            # with LMUL=2, and 2 slices uArch. Slice v2.0 is read twice, first
+            # as part of the group starting from v2, and second as part of the
+            # group starting from v1. The `del` below will be executed for the
+            # first read and then the second read will fail in `can_read` where
+            # we assume it's still in the map.
+            del self.rw_deps[instr][reg]
+            # `instr` no longer delays any pending write to `reg`.
+            for rdeps in self.wr_deps.values():
+                rdeps.get(reg, set()).discard(instr)
+            self.reads[reg].discard(instr)
+
+        if not self.rw_deps[instr]:  # every read of `instr` is done
+            del self.rw_deps[instr]
+            if instr not in self.ww_deps:
+                self.issued.remove(instr)
+
+    # Implements interfaces.Scoreboard
+    def buff_write(self, instr, regs) -> None:
+        self.write_buff[instr].update(regs)
+
+    def update_used_write_ports(self, instr, regs) -> None:
+        self.used_write_ports += len(self.write_port_regs(instr, regs))
+
+    # Implements interfaces.Scoreboard
+    def write(self, instr, regs) -> None:
+        """Record that `instr` wrote `regs` and release its dependants."""
+        self.update_used_write_ports(instr, regs)
+        for reg in regs:
+            del self.ww_deps[instr][reg]
+            del self.wr_deps[instr][reg]
+            # Readers and writers that waited on `instr` for `reg` are freed.
+            for rdeps in self.rw_deps.values():
+                if rdeps.get(reg) == instr:
+                    rdeps[reg] = None
+
+            for rdeps in self.ww_deps.values():
+                if rdeps.get(reg) == instr:
+                    rdeps[reg] = None
+
+            if self.writes[reg] == instr:
+                self.writes[reg] = None
+
+        if not self.ww_deps[instr]:  # every write of `instr` is done
+            del self.ww_deps[instr]
+            del self.wr_deps[instr]
+            if instr not in self.rw_deps:
+                self.issued.remove(instr)
+
+        self.write_buff.pop(instr, None)
+
+    # Implements interfaces.Scoreboard
+    def can_issue(self, instr) -> bool:
+        if (instr not in self.rw_deps and instr not in self.ww_deps and
+                instr not in self.wr_deps):
+            return True
+
+        if any(d and d not in self.issued
+               for d in self.rw_deps.get(instr, {}).values()):
+            return False
+
+        if any(d and d not in self.issued
+               for d in self.ww_deps.get(instr, {}).values()):
+            return False
+
+        if any(d not in self.issued
+               for ds in self.wr_deps.get(instr, {}).values()
+               for d in ds):
+            return False
+
+        return True
+
+    # Implements interfaces.Scoreboard
+    def issue(self, instr) -> None:
+        if (instr not in self.rw_deps and instr not in self.ww_deps and
+                instr not in self.wr_deps):
+            return
+
+        self.issued.add(instr)
+
+    def clear_used_ports(self) -> None:
+        self.used_read_ports = 0
+        self.used_write_ports = 0
+
+    # Implements interfaces.Scoreboard
+    # pylint: disable-next=useless-parent-delegation
+    def reset(self, cntr: Counter) -> None:
+        super().reset(cntr)
+
+    # Implements interfaces.Scoreboard
+    # pylint: disable-next=useless-parent-delegation
+    def tick(self, cntr: Counter) -> None:
+        super().tick(cntr)
+
+    # Implements interfaces.Scoreboard
+    def tock(self, cntr: Counter) -> None:
+        super().tock(cntr)
+        self.clear_used_ports()
+
+    # Implements interfaces.Scoreboard
+    def pending(self) -> int:
+        # TODO(sflur): implement?
+        assert False
+
+    # Implements interfaces.Scoreboard
+    def print_state_detailed(self, file) -> None:
+        # TODO(sflur): implement?
+        assert False
+
+    # Implements interfaces.Scoreboard
+    def get_state_three_valued_header(self) -> Sequence[str]:
+        # TODO(sflur): implement?
+        assert False
+
+    # Implements interfaces.Scoreboard
+    def get_state_three_valued(self, vals: Sequence[str]) -> Sequence[str]:
+        # TODO(sflur): implement?
+        assert False
+
+
+class VecPreemptive(Preemptive):
+    """Scoreboard for vector registers.
+
+    Each register is sliced to multiple slices.
+    """
+
+    def __init__(self, uid: str, desc: Dict[str, Any], slices: int) -> None:
+        super().__init__(uid, desc)
+
+        self.slices = slices
+
+        self.used_read_ports = [0] * slices
+        self.used_write_ports = [0] * slices
+
+    def read_port_regs(self, instr,
+                       regs: Sequence[str]) -> Dict[int, Sequence[str]]:
+        """Group by slice the regs that need a non-dedicated read port."""
+
+        res = {}
+        # Each entry of `regs` is split at its last "." into reg and slice.
+        for rs in regs:
+            r, _, s = rs.rpartition(".")
+            if (r not in self.dedicated_read_ports
+                    # rw_deps which are not None will be read from
+                    # the write-buffer.
+                    # TODO(sflur): what are the restrictions on the
+                    # write-buffer?
+                    and self.rw_deps[instr][rs] is None):
+                res.setdefault(int(s), collections.deque()).append(r)
+
+        return res
+
+    def check_read_ports(self, instr, regs) -> bool:
+        if self.read_ports is None:
+            return True
+
+        regs = self.read_port_regs(instr, regs)
+
+        return all(self.used_read_ports[s] + len(rs) <= self.read_ports
+                   for s, rs in regs.items())
+
+    def write_port_regs(self, instr: Instruction,
+                        regs: Sequence[str]) -> Dict[int, Sequence[str]]:
+        """Group by slice the regs that need a non-dedicated write port."""
+
+        res = {}
+        # Each entry of `regs` is split at its last "." into reg and slice.
+        for rs in regs:
+            r, _, s = rs.rpartition(".")
+            if r not in self.dedicated_write_ports:
+                res.setdefault(int(s), collections.deque()).append(r)
+
+        return res
+
+    def check_write_ports(self, instr, regs) -> bool:
+        if self.write_ports is None:
+            return True
+
+        regs = self.write_port_regs(instr, regs)
+
+        return all(self.used_write_ports[s] + len(rs) <= self.write_ports
+                   for s, rs in regs.items())
+
+    def update_used_read_ports(self, instr, regs) -> None:
+        for s, rs in self.read_port_regs(instr, regs).items():
+            self.used_read_ports[s] += len(rs)
+
+    def update_used_write_ports(self, instr, regs) -> None:
+        for s, rs in self.write_port_regs(instr, regs).items():
+            self.used_write_ports[s] += len(rs)
+
+    def clear_used_ports(self) -> None:
+        for s in range(self.slices):
+            self.used_read_ports[s] = 0
+            self.used_write_ports[s] = 0
diff --git a/vector_pipe.py b/vector_pipe.py
new file mode 100644
index 0000000..c06c7bb
--- /dev/null
+++ b/vector_pipe.py
@@ -0,0 +1,493 @@
+"""VectorPipe module."""
+
+import collections
+import math
+from typing import Any, Dict, Sequence, Optional, Tuple, Union
+
+from buffered_queue import BufferedQueue
+import counter
+from counter import Counter
+import instruction
+from instruction import Instruction
+import interfaces
+import scoreboard
+import utilities
+
+
+class VectorPipe(interfaces.ExecPipeline):
+    """Vector pipe model.
+
+    This pipeline supports flexible chaining and tailgating.
+
+    That is
+    - The vector register is split into a number of slices.
+    - Once a vector instruction starts producing result slices,
+      they are written to the register file in order at one slice per cycle.
+    - (See ScoreboardVector for more on issuing vector instructions.)
+
+    Scalar input registers are read at the start of the instruction (first
+    cycle of first slice).
+
+    Scalar output registers are written at the end of the instruction (last
+    cycle of the last slice).
+    """
+
+    def __init__(self, name: str, kind: str, desc: Dict[str, Any], slices: int,
+                 mem_sys,
+                 rf_scoreboards: Dict[str, Union[scoreboard.Preemptive,
+                                                 scoreboard.VecPreemptive]]
+                 ) -> None:
+        super().__init__(name, kind, desc["issue_queue"], desc["depth"])
+
+        # Execution Issue Queues
+        self._eiq = BufferedQueue(desc.get("eiq_size"))
+        self._can_skip_eiq = desc["can_skip_eiq"]
+
+        # The pipeline
+        self._slices = slices
+        self._pipelined = desc["pipelined"]
+        self._stage = collections.deque([None] * self.depth)
+        self._inflight_instr = None
+        self._inflight_next_slice = 0
+
+        # The writeback buffer
+        self._writebackq = BufferedQueue(desc.get("writeback_buff_size"))
+
+        # Interface to memory
+        self._mem = (mem_sys.elements[desc["memory_interface"]]
+                     if "memory_interface" in desc else None)
+
+        self._load_stage = desc.get("load_stage")
+        self._fixed_load_latency = desc.get("fixed_load_latency")
+        self._stalling_loads = {}
+
+        self._store_stage = desc.get("store_stage")
+        self._fixed_store_latency = desc.get("fixed_store_latency")
+        self._stalling_stores = {}
+
+        self._rf_scoreboards = rf_scoreboards
+
+    def eslices(self, instr: Instruction) -> int:
+        """The number of slices required to execute `instr`."""
+        return math.ceil(instr.max_emul() * self._slices)
+
+    def slice(self, accesses: Sequence[int], index: int,
+              eslices: int) -> Tuple[int, int]:
+        """The memory access location and size, of a given slice.
+
+        Args:
+          accesses: a sequence of memory locations, one for each vector element.
+          index: the index of the required slice.
+          eslices: the total number of slices `accesses` should be split to.
+
+        # TODO(sflur): return the access size in bytes, instead of `count`.
+        Return: (access, count), where access is the first byte of the memory
+        location that should be accessed, and count is the number of elements
+        that should be accessed.
+        """
+        alen = len(accesses)
+        slen = alen // eslices
+        start = index * slen
+        slen = min(slen, alen - start)
+        return (accesses[start], slen)
+
+    def reg_read_stall(self, instr: Instruction, s: int) -> bool:
+        return any(not self._rf_scoreboards[rf].can_read(instr, seq[s])
+                   for rf, seq in self.input_seq_by_type(instr).items()
+                   if len(seq) > s)
+
+    def reg_write_stall(self, instr: Instruction, s: int) -> bool:
+        return any(not self._rf_scoreboards[rf].can_write(instr, seq[s])
+                   for rf, seq in self.output_seq_by_type(instr).items()
+                   if len(seq) > s)
+
+    def sb_reg_read(self, instr: Instruction, s: int) -> None:
+        for rf, seq in self.input_seq_by_type(instr).items():
+            if len(seq) > s and seq[s]:
+                self._rf_scoreboards[rf].read(instr, seq[s])
+
+    def sb_buff_reg_write(self, instr: Instruction, s: int) -> None:
+        for rf, seq in self.output_seq_by_type(instr).items():
+            if len(seq) > s:
+                self._rf_scoreboards[rf].buff_write(instr, seq[s])
+
+    def sb_reg_write(self, instr: Instruction, s: int) -> None:
+        for rf, seq in self.output_seq_by_type(instr).items():
+            if len(seq) > s:
+                self._rf_scoreboards[rf].write(instr, seq[s])
+
+    def do_reg_writeback(self) -> None:
+        if self._writebackq:
+            instr, s = self._writebackq[0]
+            if not self.reg_write_stall(instr, s):
+                self.sb_reg_write(instr, s)
+                self._writebackq.popleft()
+                if s + 1 == self.eslices(instr):
+                    self.retired_instrs.append(instr)
+
+    def stall(self, cntr: Counter) -> bool:
+        # Check if last stage needs to do reg writes, and the writeback buffer
+        # is full.
+        if (self._stage[-1] and any(
+                self._stage[-1][1] < len(seq) and seq[self._stage[-1][1]]
+                for _, seq in
+                  self.output_seq_by_type(self._stage[-1][0]).items()) and
+            self._writebackq.is_buffer_full()):
+            return True
+
+        # Check if memory accesses are waiting for reply.
+        if (any(self._stalling_loads.values()) or
+            any(self._stalling_stores.values())):
+            cntr.vector_load_store_stall += 1
+            return True
+
+        return False
+
+    def do_load(self) -> None:
+        if self._load_stage is None:
+            return
+
+        if (self._stage[self._load_stage] and
+            self._stage[self._load_stage][0].loads):
+            instr, s = self._stage[self._load_stage]
+            load, _size = self.slice(instr.loads, s, self.eslices(instr))
+            if (instr, s, load) not in self._stalling_loads:
+                # TODO(sflur): pass size to issue_load
+                self._mem.issue_load((instr, s), load)
+                self._stalling_loads[(instr, s, load)] = None
+
+        if (self._stage[self._load_stage + self._fixed_load_latency] and
+                self._stage[self._load_stage +
+                            self._fixed_load_latency][0].loads):
+            instr, s = self._stage[self._load_stage + self._fixed_load_latency]
+            load, _ = self.slice(instr.loads, s, self.eslices(instr))
+            if self._stalling_loads[(instr, s, load)] is None:
+                self._stalling_loads[(instr, s, load)] = True
+            for load in self._mem.take_load_replys((instr, s)):
+                self._stalling_loads[(instr, s, load)] = False
+
+    def do_store(self) -> None:
+        if self._store_stage is None:
+            return
+
+        if self._stage[self._store_stage] and self._stage[
+                self._store_stage][0].stores:
+            instr, s = self._stage[self._store_stage]
+            store, _size = self.slice(instr.stores, s, self.eslices(instr))
+            if (instr, s, store) not in self._stalling_stores:
+                # TODO(sflur): pass size to issue_store
+                self._mem.issue_store((instr, s), store)
+                self._stalling_stores[(instr, s, store)] = None
+
+        if (self._stage[self._store_stage + self._fixed_store_latency] and
+                self._stage[self._store_stage +
+                           self._fixed_store_latency][0].stores):
+            instr, s = self._stage[self._store_stage +
+                                   self._fixed_store_latency]
+            store, _ = self.slice(instr.stores, s, self.eslices(instr))
+            if self._stalling_stores[(instr, s, store)] is None:
+                self._stalling_stores[(instr, s, store)] = True
+            for store in self._mem.take_store_replys((instr, s)):
+                self._stalling_stores[(instr, s, store)] = False
+
+    # Implements interfaces.ExecPipeline
+    def reset(self, cntr: Counter) -> None:
+        super().reset(cntr)
+        # TODO(sflur): implement proper reset
+        cntr.utilizations[f"{self.name}.eiq"] = counter.Utilization(
+            self._eiq.size)
+        cntr.utilizations[f"{self.name}.pipe"] = counter.Utilization(
+            len(self._stage))
+        cntr.utilizations[f"{self.name}.wbq"] = counter.Utilization(
+            self._writebackq.size)
+
+    # Implements interfaces.ExecPipeline
+    def tick(self, cntr: Counter) -> None:
+        """Advance the pipe by one cycle.
+
+        In order: write back at most one slice from the writeback queue;
+        unless `stall` reports a stall, drop the load/store bookkeeping of
+        the slice at the reply stage, shift the stages by one and feed the
+        next slice of the in-flight instruction (or a bubble) into stage 0;
+        run the load and store stages; finally, if the pipe is ready, try
+        to issue one instruction from the EIQ.
+        """
+        super().tick(cntr)
+
+        self.retired_instrs.clear()
+
+        self.do_reg_writeback()
+
+        if not self.stall(cntr):
+            # Clean up self._stalling_loads for the slice at the load reply
+            # stage, which is about to move on.
+            if (self._load_stage is not None and
+                    self._stage[self._load_stage + self._fixed_load_latency] and
+                    self._stage[self._load_stage +
+                               self._fixed_load_latency][0].loads):
+                instr, s = self._stage[self._load_stage +
+                                       self._fixed_load_latency]
+                load, _ = self.slice(instr.loads, s, self.eslices(instr))
+                # The assertion holds because self.stall() above returned
+                # False, so no value in self._stalling_loads is truthy.
+                # NOTE(review): .get() tolerates a missing key but the `del`
+                # below does not — confirm the key is always present here.
+                assert not self._stalling_loads.get((instr, s, load), False)
+                del self._stalling_loads[(instr, s, load)]
+
+            # Clean up self._stalling_stores for the slice at the store reply
+            # stage, which is about to move on.
+            if (self._store_stage is not None and
+                    self._stage[self._store_stage +
+                                self._fixed_store_latency] and
+                    self._stage[self._store_stage +
+                               self._fixed_store_latency][0].stores):
+                instr, s = self._stage[self._store_stage +
+                                      self._fixed_store_latency]
+                store, _ = self.slice(instr.stores, s, self.eslices(instr))
+                # The assertion holds because self.stall() above returned
+                # False, so no value in self._stalling_stores is truthy.
+                assert not self._stalling_stores.get((instr, s, store), False)
+                del self._stalling_stores[(instr, s, store)]
+
+            # Shift stages: the slice leaving the last stage either goes to
+            # the writeback queue (when it has registers to write) or, if it
+            # is the last slice of its instruction, retires immediately.
+            st = self._stage.pop()
+            if st:
+                instr, s = st
+                if any(
+                        len(seq) > s and seq[s]
+                        for _, seq in self.output_seq_by_type(instr).items()):
+                    self._writebackq.buffer((instr, s))
+                    cntr.utilizations[f"{self.name}.wbq"].count += 1
+                    self.sb_buff_reg_write(instr, s)
+                elif s + 1 == self.eslices(instr):
+                    self.retired_instrs.append(instr)
+
+            # Issue the next slice into the pipeline, or insert a bubble when
+            # there is no in-flight instruction or its registers cannot be
+            # read yet.
+            if (self._inflight_instr and not self.reg_read_stall(
+                    self._inflight_instr, self._inflight_next_slice)):
+                self.sb_reg_read(self._inflight_instr,
+                                 self._inflight_next_slice)
+
+                self._stage.appendleft(
+                    (self._inflight_instr, self._inflight_next_slice))
+                cntr.utilizations[f"{self.name}.pipe"].count += 1
+                self._inflight_next_slice += 1
+                # All slices issued: the instruction is no longer in flight.
+                if self._inflight_next_slice == self.eslices(
+                        self._inflight_instr):
+                    self._inflight_instr = None
+            else:
+                self._stage.appendleft(None)
+
+        self.do_load()
+        self.do_store()
+
+        # Try to issue each of the instructions in `eiq`, starting from the
+        # front, and stop at the first one that issues. Instructions that
+        # fail to issue are re-appended at the back of the queue.
+        # NOTE(review): when an instruction issues mid-scan, the ones that
+        # failed before it end up behind the ones that were not tried yet —
+        # confirm this reordering is intended.
+        if self.is_ready():
+            for _ in range(len(self._eiq)):
+                instr = self._eiq.popleft()
+                if self.try_issue(instr, cntr):
+                    break
+
+                self._eiq.append(instr)
+
+    # Implements interfaces.ExecPipeline
+    def tock(self, cntr: Counter) -> None:
+        super().tock(cntr)
+
+        self.retired_instrs.clear()
+
+        cntr.utilizations[f"{self.name}.pipe"].occupied += len(
+            list(1 for i in self._stage if i))
+
+        self._eiq.flush()
+        cntr.utilizations[f"{self.name}.eiq"].occupied += len(self._eiq)
+
+        self._writebackq.flush()
+        cntr.utilizations[f"{self.name}.wbq"].occupied += len(self._writebackq)
+
+    # Implements interfaces.ExecPipeline
+    def pending(self) -> int:
+        eiq_count = len(list(self._eiq.chain()))
+        pipe_count = len(list(1 for i in self._stage if i))
+        wbq_count = len(list(self._writebackq.chain()))
+        return eiq_count + pipe_count + wbq_count
+
+    # Implements interfaces.ExecPipeline
+    def try_dispatch(self, instr: Instruction, cntr: Counter) -> bool:
+        if self._eiq.is_buffer_full():
+            return False
+
+        inputs = self.input_seq_by_type(instr)
+        outputs = self.output_seq_by_type(instr)
+
+        for rf in inputs.keys() | outputs.keys():
+            reads = utilities.flatten(inputs.get(rf, []))
+            writes = utilities.flatten(outputs.get(rf, []))
+            self._rf_scoreboards[rf].insert_accesses(instr, reg_reads=reads,
+                                                     reg_writes=writes)
+
+        if not (self._can_skip_eiq and self.is_ready() and
+                self.try_issue(instr, cntr)):
+            self._eiq.buffer(instr)
+            cntr.utilizations[f"{self.name}.eiq"].count += 1
+
+        if instr.loads or instr.stores:
+            cntr.vector_load_store += 1
+
+        return True
+
+    def is_ready(self) -> bool:
+        """Check if the pipe is ready to accept a new instruction."""
+        if self._pipelined:
+            return self._inflight_instr is None and self._stage[0] is None
+
+        return self._inflight_instr is None and all(
+            s is None for s in self._stage)
+
+    def try_issue(self, instr: Instruction, cntr: Counter) -> bool:
+        """Issue an instruction."""
+
+        if not all(sb.can_issue(instr) for sb in self._rf_scoreboards.values()):
+            return False
+
+        if self.reg_read_stall(instr, 0):
+            return False
+
+        assert self._stage[0] is None
+        self._stage[0] = (instr, 0)
+        cntr.utilizations[f"{self.name}.pipe"].count += 1
+
+        assert self._inflight_instr is None
+        if 1 < self.eslices(instr):
+            self._inflight_instr = instr
+            self._inflight_next_slice = 1
+
+        for sb in self._rf_scoreboards.values():
+            sb.issue(instr)
+
+        self.sb_reg_read(instr, 0)
+
+        return True
+
+    def vec_reg_seq(self, reg: str, input_reg: bool, emul: Union[int, float],
+                    max_emul: Union[int, float]) -> Sequence[Optional[str]]:
+        base = int(reg[1:])
+        if emul < 1:
+            seq = [f"{reg}.{s}" for s in range(math.ceil(emul * self._slices))]
+        else:
+            emul = int(emul)
+            seq = [
+                f"{reg[0]}{base + g}.{s}" for g in range(emul)
+                for s in range(self._slices)
+            ]
+
+        if (emul == max_emul or (emul < 1 and self._slices < 1 / emul)):
+            return seq
+
+        assert int(max_emul / emul) == 2
+
+        # Interleave Nones with `seq`
+        if input_reg:
+            seq = zip(seq, [None] * len(seq))
+        else:
+            seq = zip([None] * len(seq), seq)
+        return utilities.flatten(seq)
+
+    def input_seq(self, instr: Instruction, reg: str) -> Sequence[str]:
+        if instruction.is_vector_register(reg):
+            assert instr.lmul is not None
+
+            # TODO(sflur): anymore cases of input widening?
+            if ((instr.mnemonic.endswith(".wv") or
+                 instr.mnemonic.endswith(".wx") or
+                 instr.mnemonic.endswith(".wf") or
+                 instr.mnemonic.endswith(".wi")) and instr.operands[1] == reg):
+                emul = 2 * instr.lmul
+            else:
+                emul = instr.lmul
+
+            return self.vec_reg_seq(reg, True, emul, instr.max_emul())
+
+        return [reg]
+
+    def output_seq(self, instr: Instruction, reg: str) -> Sequence[str]:
+        if instruction.is_vector_register(reg):
+            assert instr.lmul is not None
+
+            # TODO(sflur): anymore cases of output widening?
+            if ((instr.mnemonic.startswith("vw") or
+                 instr.mnemonic.startswith("vfw")) and
+                    instr.operands[0] == reg):
+                emul = 2 * instr.lmul
+            else:
+                emul = instr.lmul
+
+            return self.vec_reg_seq(reg, False, emul, instr.max_emul())
+
+        res = [None] * self.eslices(instr)
+        res[-1] = reg
+        return res
+
+    def input_seq_by_type(
+            self, instr: Instruction) -> Dict[str, Sequence[Sequence[str]]]:
+        """Compute a map from register-files to sequences of input register
+        sets.
+
+        A register file is mapped to a sequence of sets, where set i is the set
+        of registers that will be read from by slice i.
+        """
+        res = {}
+
+        for ty, regs in instr.inputs_by_type().items():
+            seq = [collections.deque() for _ in range(self.eslices(instr))]
+
+            for reg in regs:
+                for i, r in enumerate(self.input_seq(instr, reg)):
+                    if r:
+                        seq[i].append(r)
+
+            res[ty] = seq
+
+        return res
+
+    def output_seq_by_type(
+            self, instr: Instruction) -> Dict[str, Sequence[Sequence[str]]]:
+        """Compute a map from register-files to sequences of output register
+        sets.
+
+        A register file is mapped to a sequence of sets, where set i is the set
+        of registers that will be written to by slice i.
+        """
+        res = {}
+
+        for ty, regs in instr.outputs_by_type().items():
+            seq = [collections.deque() for _ in range(self.eslices(instr))]
+
+            for reg in regs:
+                for i, r in enumerate(self.output_seq(instr, reg)):
+                    if r:
+                        seq[i].append(r)
+
+            res[ty] = seq
+
+        return res
+
+    # Implements interfaces.ExecPipeline
+    def print_state_detailed(self, file) -> None:
+        eiq_str = ", ".join(str(i) for i in reversed(list(self._eiq.chain())))
+        stages = ", ".join(f"{i[0]} ({i[1]})" if i else "-"
+                           for i in self._stage)
+        wbq_str = ", ".join(f"{i[0]} ({i[1]})"
+                            for i in reversed(list(self._writebackq.chain())))
+
+        pipe_str = (f"{eiq_str if eiq_str else '-'}"
+                    f" > {stages}"
+                    f" > {wbq_str if wbq_str else '-'}")
+
+        print(f"[{self.name}] {pipe_str}", file=file)
+
+    # Implements interfaces.ExecPipeline
+    def get_state_three_valued_header(self) -> Sequence[str]:
+        return ["eiq", self.kind, "wbq"]
+
+    # Implements interfaces.ExecPipeline
+    def get_state_three_valued(self, vals: Sequence[str]) -> Sequence[str]:
+        if all(self._stage):
+            # Full
+            pipe_str = vals[2]
+        elif any(self._stage):
+            # Partial
+            pipe_str = vals[1]
+        else:
+            # Empty
+            pipe_str = vals[0]
+
+        return [self._eiq.pp_three_valued(vals),
+                pipe_str,
+                self._writebackq.pp_three_valued(vals)]