[otbnsim] Collect execution statistics from ISS

The ISS now has the ability to connect statistics during the program
execution and to analyze those statistics. The analysis includes:

* Execution statistics
* Instruction histograms
* Function call statistics
* Loop statistics

This commit also contains a tiny test to ensure that the statistics we
collect match expectations.

Signed-off-by: Philipp Wagner <phw@lowrisc.org>
diff --git a/hw/ip/otbn/dv/otbnsim/sim/sim.py b/hw/ip/otbn/dv/otbnsim/sim/sim.py
index 5383443..4af4012 100644
--- a/hw/ip/otbn/dv/otbnsim/sim/sim.py
+++ b/hw/ip/otbn/dv/otbnsim/sim/sim.py
@@ -6,6 +6,7 @@
 
 from .isa import OTBNInsn
 from .state import OTBNState
+from .stats import ExecutionStats
 from .trace import Trace
 
 _TEST_RND_DATA = \
@@ -16,6 +17,7 @@
     def __init__(self) -> None:
         self.state = OTBNState()
         self.program = []  # type: List[OTBNInsn]
+        self.stats = ExecutionStats()
 
     def load_program(self, program: List[OTBNInsn]) -> None:
         self.program = program.copy()
@@ -23,7 +25,7 @@
     def load_data(self, data: bytes) -> None:
         self.state.dmem.load_le_words(data)
 
-    def run(self, verbose: bool) -> int:
+    def run(self, verbose: bool, collect_stats: bool) -> int:
         '''Run until ECALL.
 
         Return the number of cycles taken.
@@ -34,7 +36,7 @@
         # valid when in free running mode as nothing else will.
         self.state.set_urnd_reseed_complete()
         while self.state.running:
-            self.step(verbose)
+            self.step(verbose, collect_stats)
             insn_count += 1
 
             if self.state.wsrs.RND.pending_request:
@@ -44,7 +46,9 @@
 
         return insn_count
 
-    def step(self, verbose: bool) -> Tuple[Optional[OTBNInsn], List[Trace]]:
+    def step(self,
+             verbose: bool,
+             collect_stats: bool) -> Tuple[Optional[OTBNInsn], List[Trace]]:
         '''Run a single instruction.
 
         Returns the instruction, together with a list of the architectural
@@ -78,11 +82,17 @@
             self.state.commit(sim_stalled=True)
             disasm = '(stall)'
             changes = []
+
+            if collect_stats:
+                self.stats.record_stall()
         else:
             self.state.pre_insn(insn.affects_control)
             insn.execute(self.state)
             self.state.post_insn()
 
+            if collect_stats:
+                self.stats.record_insn(pc_before, insn, self.state)
+
             if self.state.pending_halt:
                 # Roll back any pending state changes, ensuring that a faulting
                 # instruction doesn't actually do anything (this also aborts an
diff --git a/hw/ip/otbn/dv/otbnsim/sim/state.py b/hw/ip/otbn/dv/otbnsim/sim/state.py
index db8effe..325991f 100644
--- a/hw/ip/otbn/dv/otbnsim/sim/state.py
+++ b/hw/ip/otbn/dv/otbnsim/sim/state.py
@@ -29,6 +29,8 @@
         self.pc = 0
         self.pc_next = None  # type: Optional[int]
 
+        self.start_addr = None  # type: Optional[int]
+
         _, imem_size = get_memory_layout()['IMEM']
         self.imem_size = imem_size
 
@@ -147,6 +149,7 @@
         self._urnd_reseed_complete = False
 
         self.pc = addr
+        self.start_addr = addr
 
         # Reset CSRs, WSRs and loop stack
         self.csrs = CSRFile()
diff --git a/hw/ip/otbn/dv/otbnsim/sim/stats.py b/hw/ip/otbn/dv/otbnsim/sim/stats.py
new file mode 100644
index 0000000..4481ff3
--- /dev/null
+++ b/hw/ip/otbn/dv/otbnsim/sim/stats.py
@@ -0,0 +1,298 @@
+# Copyright lowRISC contributors.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+from collections import Counter
+import typing
+from typing import Dict, List, Optional, Tuple
+
+from elftools.dwarf.dwarfinfo import DWARFInfo  # type: ignore
+from elftools.elf.elffile import ELFFile  # type: ignore
+from elftools.elf.sections import SymbolTableSection  # type: ignore
+from tabulate import tabulate
+
+from .insn import JAL, JALR, LOOP, LOOPI
+from .isa import OTBNInsn
+from .state import OTBNState
+
+
+class ExecutionStats:
+    def __init__(self) -> None:
+        self.stall_count = 0
+        self.insn_histo = Counter()  # type: typing.Counter[str]
+        self.func_calls = []  # type: List[Dict[str, int]]
+        self.loops = []  # type: List[Dict[str, int]]
+
+    @property
+    def insn_count(self) -> int:
+        ''' Get the number of executed instructions. '''
+        return sum(self.insn_histo.values())
+
+    def record_stall(self) -> None:
+        ''' Record a single stall cycle. '''
+        self.stall_count += 1
+
+    def record_insn(self,
+                    pc: int,
+                    insn: OTBNInsn,
+                    state_before_commit: OTBNState) -> None:
+        ''' Record the execution of an instruction. '''
+
+        # Instruction histogram
+        self.insn_histo[insn.insn.mnemonic] += 1
+
+        # Function calls
+        # - Direct function calls: jal x1, <offset>
+        # - Indirect function calls: jalr x1, <grs1>, 0
+        if (isinstance(insn, JAL) or isinstance(insn, JALR)) and insn.grd == 1:
+            call_stack = state_before_commit.peek_call_stack()
+            if call_stack:
+                caller_func = call_stack[0]
+            else:
+                assert state_before_commit.start_addr is not None
+                caller_func = state_before_commit.start_addr
+
+            assert state_before_commit.pc_next is not None
+            self.func_calls.append({
+                'call_site': pc,
+                'caller_func': caller_func,
+                'callee_func': state_before_commit.pc_next,
+            })
+
+        # Loops
+        if isinstance(insn, LOOP) or isinstance(insn, LOOPI):
+            iterations = state_before_commit.loop_stack.stack[-1].loop_count
+            self.loops.append({
+                'loop_addr': pc,
+                'loop_len': insn.bodysize,
+                'iterations': iterations,
+            })
+
+
+def _dwarf_decode_file_line(dwarf_info: DWARFInfo,
+                            address: int) -> Optional[Tuple[str, int]]:
+    # Go over all the line programs in the DWARF information, looking for
+    # one that describes the given address.
+    for CU in dwarf_info.iter_CUs():
+        # First, look at line programs to find the file/line for the address
+        lineprog = dwarf_info.line_program_for_CU(CU)
+        prevstate = None
+        for entry in lineprog.get_entries():
+            # We're interested in those entries where a new state is assigned
+            if entry.state is None:
+                continue
+            if entry.state.end_sequence:
+                # if the line number sequence ends, clear prevstate.
+                prevstate = None
+                continue
+            # Looking for a range of addresses in two consecutive states that
+            # contain the required address.
+            if prevstate and prevstate.address <= address < entry.state.address:
+                filename = lineprog['file_entry'][prevstate.file - 1].name.decode('utf-8')
+                line = prevstate.line
+                return filename, line
+            prevstate = entry.state
+    return None
+
+
+def _get_addr_symbol_map(elf_file: ELFFile) -> Dict[int, str]:
+    section = elf_file.get_section_by_name('.symtab')
+
+    if not isinstance(section, SymbolTableSection):
+        return {}
+
+    return {sym.entry.st_value: sym.name for sym in section.iter_symbols()}
+
+
+class ExecutionStatAnalyzer:
+    # Assumed clock frequency of OTBN, in MHz.
+    FREQ_MHZ = 100
+
+    def __init__(self, stats: ExecutionStats, elf_file_path: str):
+        self._elf_file = ELFFile(open(elf_file_path, 'rb'))
+        self._stats = stats
+        self._addr_symbol_map = _get_addr_symbol_map(self._elf_file)
+
+    def _describe_imem_addr(self, address: int) -> str:
+        symbol_name = None
+        if address in self._addr_symbol_map:
+            symbol_name = self._addr_symbol_map[address]
+        else:
+            # |func_addr| is the largest possible |sym_addr| which is at most
+            # |address|.
+            func_addr = 0
+            for sym_addr in self._addr_symbol_map.keys():
+                if sym_addr <= address and sym_addr > func_addr:
+                    func_addr = sym_addr
+            symbol_name = self._addr_symbol_map[func_addr] + f"+{address - func_addr:#x}"
+
+        file_line = None
+        if self._elf_file.has_dwarf_info():
+            dwarf_info = self._elf_file.get_dwarf_info()
+            file_line = _dwarf_decode_file_line(dwarf_info, address)
+
+        add_info = []
+        if symbol_name:
+            add_info.append(f"{symbol_name}")
+        if file_line:
+            add_info.append(f"at {file_line[0]}:{file_line[1]}")
+
+        str = f"{address:#x}"
+        if add_info:
+            str += ' (' + ' '.join(add_info) + ')'
+        return str
+
+    def dump(self) -> str:
+        out = ""
+        out += "\n"
+        out += "Execution time\n"
+        out += "--------------\n"
+        out += self._dump_execution_time()
+        out += "\n\n"
+        out += "Instruction frequencies\n"
+        out += "-----------------------\n"
+        out += self._dump_insn_histo()
+        out += "\n\n"
+        out += "Function call statistics\n"
+        out += "------------------------\n"
+        out += self._dump_function_call_stats()
+        out += "\n\n"
+        out += "Loop statistics\n"
+        out += "---------------\n"
+        out += self._dump_loop_stats()
+        out += "\n"
+
+        return out
+
+    def _dump_execution_time(self) -> str:
+        insn_count = self._stats.insn_count
+        stall_count = self._stats.stall_count
+        stall_percent = stall_count / insn_count * 100
+        cycles = insn_count + stall_count
+        time_ms = cycles / (self.FREQ_MHZ * 1e6) * 1e3
+
+        out = f"OTBN executed {insn_count} instructions in {cycles} cycles.\n"
+
+        out += f"The execution stalled for {stall_count} cycles "
+        out += f"({stall_percent:.01f} percent).\n"
+
+        out += f"The execution would take {time_ms:.02f} ms "
+        out += f"@ {self.FREQ_MHZ} MHz.\n"
+        return out
+
+    def _dump_insn_histo(self) -> str:
+        return tabulate(self._stats.insn_histo.most_common(),
+                        headers=['instruction', 'count']) + "\n"
+
+    def _dump_function_call_stats(self) -> str:
+        """ Dump function call statistics """
+
+        if not self._stats.func_calls:
+            return "No functions were called.\n"
+
+        out = ""
+
+        # TODO: Add the start address as function?
+
+        # Build function call graphs and a call site index
+        # caller-indexed == forward, callee-indexed == reverse
+        # The call graphs are on function granularity; the call sites dictionary
+        # is indexed by the called function, but uses the call site as value.
+        callgraph = {}  # type: Dict[int, typing.Counter[int]]
+        rev_callgraph = {}  # type: Dict[int, typing.Counter[int]]
+        rev_callsites = {}  # type: Dict[int, typing.Counter[int]]
+        for c in self._stats.func_calls:
+            if c['caller_func'] not in callgraph:
+                callgraph[c['caller_func']] = Counter()
+            callgraph[c['caller_func']][c['callee_func']] += 1
+
+            if c['callee_func'] not in rev_callgraph:
+                rev_callgraph[c['callee_func']] = Counter()
+            rev_callgraph[c['callee_func']][c['caller_func']] += 1
+
+            if c['callee_func'] not in rev_callsites:
+                rev_callsites[c['callee_func']] = Counter()
+            rev_callsites[c['callee_func']][c['call_site']] += 1
+
+        total_leaf_calls = 0
+        total_calls_to_funcs_with_one_callsite = 0
+        total_func_calls = 0
+        for rev_callee_func, rev_caller_funcs in rev_callgraph.items():
+            has_one_callsite = False
+            func = self._describe_imem_addr(rev_callee_func)
+            out += f"Function {func}\n"
+            out += "  is called from the following functions\n"
+            for rev_caller_func, cnt in rev_caller_funcs.most_common():
+                func = self._describe_imem_addr(rev_caller_func)
+                out += f"    * {cnt} times by function {func}\n"
+            out += "  from the following call sites\n"
+            for rev_callsite, cnt in rev_callsites[rev_callee_func].most_common():
+                func = self._describe_imem_addr(rev_callsite)
+                out += f"    * {cnt} times from {func}\n"
+
+            has_one_callsite = len(rev_callsites[rev_callee_func]) == 1
+
+            out += "  calls\n"
+            if rev_callee_func not in callgraph:
+                out += "    no other function (leaf function).\n"
+
+                if not has_one_callsite:
+                    # We don't count it as leaf function call if it has only one
+                    # call site to prevent double-counting these as optimization
+                    # opportunity.
+                    total_leaf_calls += sum(rev_caller_funcs.values())
+            else:
+                caller_funcs = callgraph[rev_callee_func]
+                for caller_func, cnt in caller_funcs.most_common():
+                    func = self._describe_imem_addr(caller_func)
+                    out += f"    * {cnt} times function {func}\n"
+            out += "\n"
+
+            if has_one_callsite:
+                total_calls_to_funcs_with_one_callsite += (
+                    rev_caller_funcs.most_common()[0][1])
+
+            total_func_calls += sum(rev_caller_funcs.values())
+        out += "\n"
+
+        # Function call statistics
+        total_calls_req_call = (total_func_calls - total_leaf_calls -
+                                total_calls_to_funcs_with_one_callsite)
+        out += f"Of a total of {total_func_calls} function calls, there were\n"
+        out += (f"  {total_calls_to_funcs_with_one_callsite} function calls "
+                f"to a function with only one call site (call/ret can be "
+                f"replaced with static jumps)\n")
+        out += (f"  {total_leaf_calls} leaf function calls "
+                f"(no function prologue/epilogue needed)\n")
+        out += (f"Overall, {total_calls_req_call} of {total_func_calls} "
+                f"({(total_calls_req_call / total_func_calls * 100):.02f} "
+                f"percent) need full function call semantics.\n")
+
+        return out
+
+    def _dump_loop_stats(self) -> str:
+        loops = self._stats.loops
+        loop_cnt = len(loops)
+
+        out = f"Loops: {loop_cnt}\n"
+
+        if loop_cnt != 0:
+            loop_len_values = [loop['loop_len'] for loop in loops]
+            loop_len_min = min(loop_len_values)
+            loop_len_max = max(loop_len_values)
+            loop_len_avg = sum(loop_len_values) / loop_cnt
+
+            loop_iterations_values = [loop['iterations'] for loop in loops]
+            loop_iterations_min = min(loop_iterations_values)
+            loop_iterations_max = max(loop_iterations_values)
+            loop_iterations_avg = sum(loop_iterations_values) / loop_cnt
+
+            out += "Loop body length (instructions): "
+            out += f"min: {loop_len_min}, max: {loop_len_max}, "
+            out += f"avg: {loop_len_avg:.02f}\n"
+
+            out += f"Number of iterations: min: {loop_iterations_min}, "
+            out += f"max: {loop_iterations_max}, "
+            out += f"avg: {loop_iterations_avg:.02f}\n"
+
+        return out
diff --git a/hw/ip/otbn/dv/otbnsim/standalone.py b/hw/ip/otbn/dv/otbnsim/standalone.py
index 88482d9..834a457 100755
--- a/hw/ip/otbn/dv/otbnsim/standalone.py
+++ b/hw/ip/otbn/dv/otbnsim/standalone.py
@@ -8,6 +8,7 @@
 
 from sim.elf import load_elf
 from sim.sim import OTBNSim
+from sim.stats import ExecutionStatAnalyzer
 
 
 def main() -> int:
@@ -28,14 +29,23 @@
         help=("after execution, write the GPR and WDR contents to this file. "
               "Use '-' to write to STDOUT.")
     )
+    parser.add_argument(
+        '--dump-stats',
+        metavar="FILE",
+        type=argparse.FileType('w'),
+        help=("after execution, write execution statistics to this file. "
+              "Use '-' to write to STDOUT.")
+    )
 
     args = parser.parse_args()
 
+    collect_stats = args.dump_stats is not None
+
     sim = OTBNSim()
     load_elf(sim, args.elf)
 
     sim.state.start(0)
-    sim.run(verbose=args.verbose)
+    sim.run(verbose=args.verbose, collect_stats=collect_stats)
 
     if args.dump_dmem is not None:
         args.dump_dmem.write(sim.dump_data())
@@ -46,6 +56,10 @@
         for idx, value in enumerate(sim.state.wdrs.peek_unsigned_values()):
             args.dump_regs.write(' w{:<2} = 0x{:064x}\n'.format(idx, value))
 
+    if collect_stats:
+        stat_analyzer = ExecutionStatAnalyzer(sim.stats, args.elf)
+        args.dump_stats.write(stat_analyzer.dump())
+
     return 0
 
 
diff --git a/hw/ip/otbn/dv/otbnsim/stepped.py b/hw/ip/otbn/dv/otbnsim/stepped.py
index 44edeb3..fa092b2 100755
--- a/hw/ip/otbn/dv/otbnsim/stepped.py
+++ b/hw/ip/otbn/dv/otbnsim/stepped.py
@@ -89,7 +89,7 @@
     pc = sim.state.pc
     assert 0 == pc & 3
 
-    insn, changes = sim.step(verbose=False)
+    insn, changes = sim.step(verbose=False, collect_stats=False)
 
     if insn is None:
         hdr = 'STALL'
@@ -108,7 +108,7 @@
         raise ValueError('run expects zero arguments. Got {}.'
                          .format(args))
 
-    num_cycles = sim.run(verbose=False)
+    num_cycles = sim.run(verbose=False, collect_stats=False)
     print(' ran for {} cycles'.format(num_cycles))
 
 
diff --git a/hw/ip/otbn/dv/otbnsim/test/simple/subroutines/direct-call.exp b/hw/ip/otbn/dv/otbnsim/test/simple/subroutines/direct-call.exp
new file mode 100644
index 0000000..0cd9f40
--- /dev/null
+++ b/hw/ip/otbn/dv/otbnsim/test/simple/subroutines/direct-call.exp
@@ -0,0 +1,5 @@
+# Copyright lowRISC contributors.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+x11 = 4
diff --git a/hw/ip/otbn/dv/otbnsim/test/simple/subroutines/direct-call.s b/hw/ip/otbn/dv/otbnsim/test/simple/subroutines/direct-call.s
new file mode 100644
index 0000000..3ad5182
--- /dev/null
+++ b/hw/ip/otbn/dv/otbnsim/test/simple/subroutines/direct-call.s
@@ -0,0 +1,16 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/* Call a subroutine directly. */
+
+main:
+  addi    x11, x0, 5
+
+  jal     x1, subroutine
+
+  ecall
+
+subroutine:
+  addi    x11, x11, -1
+  jalr    x0, x1, 0
diff --git a/hw/ip/otbn/dv/otbnsim/test/simple/subroutines/indirect-call.exp b/hw/ip/otbn/dv/otbnsim/test/simple/subroutines/indirect-call.exp
new file mode 100644
index 0000000..96898e5
--- /dev/null
+++ b/hw/ip/otbn/dv/otbnsim/test/simple/subroutines/indirect-call.exp
@@ -0,0 +1,6 @@
+# Copyright lowRISC contributors.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+x10 = 0x10
+x11 = 4
diff --git a/hw/ip/otbn/dv/otbnsim/test/simple/subroutines/indirect-call.s b/hw/ip/otbn/dv/otbnsim/test/simple/subroutines/indirect-call.s
new file mode 100644
index 0000000..588bd36
--- /dev/null
+++ b/hw/ip/otbn/dv/otbnsim/test/simple/subroutines/indirect-call.s
@@ -0,0 +1,17 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/* Call a subroutine through a function pointer. */
+
+main:
+  addi    x11, x0, 5
+
+  addi    x10, x0, 0x10
+  jalr    x1, x10, 0
+
+  ecall
+
+subroutine:
+  addi    x11, x11, -1
+  jalr    x0, x1, 0
diff --git a/hw/ip/otbn/dv/otbnsim/test/stats_test.py b/hw/ip/otbn/dv/otbnsim/test/stats_test.py
new file mode 100644
index 0000000..501fdde
--- /dev/null
+++ b/hw/ip/otbn/dv/otbnsim/test/stats_test.py
@@ -0,0 +1,75 @@
+# Copyright lowRISC contributors.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+import py
+import os
+
+from sim.sim import OTBNSim
+from sim.stats import ExecutionStats
+from sim.elf import load_elf
+import testutil
+
+
+def _run_sim_for_stats(asm_file: str, tmpdir: str) -> ExecutionStats:
+    """ Run the OTBN simulator, collect statistics, and return them. """
+
+    assert os.path.exists(asm_file)
+    elf_file = testutil.asm_and_link_one_file(asm_file, tmpdir)
+
+    sim = OTBNSim()
+    load_elf(sim, elf_file)
+
+    sim.state.start(0)
+    sim.run(verbose=False, collect_stats=True)
+
+    return sim.stats
+
+
+def test_general_and_loop(tmpdir: py.path.local) -> None:
+    """ Test the collection of general statistics as well as loop stats. """
+
+    asm_file = os.path.join(os.path.dirname(__file__),
+                            'simple', 'loops', 'loops.s')
+    stats = _run_sim_for_stats(asm_file, str(tmpdir))
+
+    # General statistics
+    assert stats.stall_count == 2
+    assert stats.insn_count == 28
+    assert stats.insn_histo == {'addi': 22, 'loop': 4, 'loopi': 1, 'ecall': 1}
+    assert stats.func_calls == []
+
+    # Loop statistics.
+    exp = [
+        # Outer LOOPI
+        {'iterations': 4, 'loop_addr': 8, 'loop_len': 4},
+
+        # Inner LOOP
+        {'iterations': 3, 'loop_addr': 16, 'loop_len': 1},
+        {'iterations': 3, 'loop_addr': 16, 'loop_len': 1},
+        {'iterations': 3, 'loop_addr': 16, 'loop_len': 1},
+        {'iterations': 3, 'loop_addr': 16, 'loop_len': 1}
+    ]
+    assert stats.loops == exp
+
+
+def test_func_call_direct(tmpdir: py.path.local) -> None:
+    """ Test the collection of statistics related to loops. """
+
+    asm_file = os.path.join(os.path.dirname(__file__),
+                            'simple', 'subroutines', 'direct-call.s')
+    stats = _run_sim_for_stats(asm_file, str(tmpdir))
+
+    exp = [{'call_site': 4, 'callee_func': 12, 'caller_func': 0}]
+    assert stats.func_calls == exp
+
+
+def test_func_call_indirect(tmpdir: py.path.local) -> None:
+    """ Test the collection of statistics related to loops. """
+
+    asm_file = os.path.join(os.path.dirname(__file__),
+                            'simple', 'subroutines', 'indirect-call.s')
+    stats = _run_sim_for_stats(asm_file, str(tmpdir))
+
+    exp = [{'call_site': 8, 'callee_func': 16, 'caller_func': 0}]
+    assert stats.func_calls == exp