Update veri-titan to secure-foundations/veri-titan@a5b18fa

Update code from upstream repository https://github.com/secure-
foundations/veri-titan.git to revision
a5b18fa0bea5d943c9267d3561cbd2704764a868

* Make a separate procedure for throwing deliberate errors. (Jade
  Philipoom)
* Make the loop iteration counter count write_to_dmem loops in
  addition to modexp (Jade Philipoom)
* Add loop iteration counter for fault injection hardening. (Jade
  Philipoom)
* Add comments to generated OTBN modexp (Jade Philipoom)
* Add back auto-generated files with a new .gitignore pattern (Jade
  Philipoom)
* Make Z3 target more platform-independent (Bryan Parno)
* progress on the polymul (yizhou7)
* poly mul and poly scale (yizhou7)
* rv bit rev (yizhou7)
* somewhat messy but almost done with bit_rev in riscv (yizhou7)
* removed assumes in bit rev proofs (yizhou7)
* removed one assume (yizhou7)
* bit rev model somewhat working (yizhou7)
* some progress (yizhou7)
* add b as part of the datatype (yizhou7)
* some progress (yizhou7)
* bit rev shuffle proof more complicated than expected (yizhou7)
* inverse ntt (yizhou7)
* rename invs to avoid conflicts (yizhou7)
* directory reorg (yizhou7)
* updated the riscv impl (yizhou7)
* remove large amount of duplications in ntt and intt (yizhou7)
* intt dafny model and proof (yizhou7)
* Update names of OTBN build scripts (Jade Philipoom)
* rename inner mq sub add (yizhou7)
* use mq_add mq_sub with fixed regs (yizhou7)
* forward ntt seems to work (yizhou7)
* add post lemma for t_inv (yizhou7)
* fixed unsound requires (yizhou7)
* inner most loop seems done (yizhou7)
* annoying bound proofs (yizhou7)
* wip on s_loop (yizhou7)
* wip for s_loop (yizhou7)
* update dafny impl (yizhou7)
* wip (yizhou7)
* reorg innt model (yizhou7)
* fix up the rsa code (yizhou7)
* worked around werid dafny asbtract module bug (yizhou7)
* add support for b16 iter wirte (yizhou7)
* add some support for b16_iter (yizhou7)
* Update README.md to include Singular requirement (Sydney Gibson)
* Add subm ins to OTBN and clean up files (Sydney Gibson)
* Fix mq_add to not use hardcoded modulus (Sydney Gibson)
* mq_add done with assumed Q=12289 in wmod (Sydney Gibson)
* remove assume about Q - x_e (yizhou7)
* addm fully implemented plus correctness lemma (Sydney Gibson)
* Get otbn mq_arith test-ready (Sydney Gibson)
* intt (with maybe one assume) (yizhou7)
* revamp forward ntt so that things are pretty much all parameterized
  (yizhou7)
* structurally very similar invs for innt (yizhou7)
* include the command ran (yizhou7)
* some basic comparsion (yizhou7)
* remove the need to pass in psi (yizhou7)
* much cleaner python references (yizhou7)
* Add otbn addm instruction (Sydney Gibson)
* our alternative version seems to work (yizhou7)
* mostly got the alternative version working (yizhou7)
* update the dafny impl to use the table (yizhou7)
* separate table (yizhou7)
* update mulntt_ct to use montmul and adjusted twiddle factors
  (yizhou7)
* update python script to use twiddle factors in montgomery form
  (yizhou7)
* cleanup python scripts (yizhou7)
* prove mq_rshift1 procedure and lemma (Sydney Gibson)
* test out intt using the same ct code (yizhou7)
* removed most assumes (yizhou7)
* removed assume in x_value_odd_square_lemma (yizhou7)
* remove assume in x_value_even_square_lemma (yizhou7)
* reorg bit_rev_int (yizhou7)
* removed some assumes (yizhou7)
* a version with many assumes (yizhou7)
* done proving j_loop_inv_pre_lemma/j_loop_inv_post_lemma (yizhou7)
* next_t_loop_view (yizhou7)
* proved build_higher_inverse_lemma (yizhou7)
* s_loop_inv pre/post lemma (yizhou7)
* stronger s_loop_perserves_lower_inv_lemma (yizhou7)
* s_loop_perserves_inv_lemma (yizhou7)
* proved s_loop_perserves_higher_inv_lemma (yizhou7)
* proved the core butterfly ops (with some assumes) (yizhou7)
* replaced assume about x_square (yizhou7)
* remove two assumes about bitrev (yizhou7)
* proved lower_points_view_value_lemma (yizhou7)
* points_view_index_lemmas (yizhou7)
* cleanups on level_polys & level_points_view (yizhou7)
* even more completed invs (yizhou7)
* start porting invs to dafny (yizhou7)
* more complete invs (yizhou7)
* some progress on the polysplit (yizhou7)
* starting the dafny loop version (yizhou7)
* some more inv figured out (yizhou7)
* got some inv figured out (yizhou7)
* inv guess seems to hold for the last level at least (yizhou7)
* testing out invs on python code (yizhou7)
* redo nth root lemma (yizhou7)
* sanity checking python (yizhou7)
* progress on mq_rshift1 proof (Sydney Gibson)
* add neg instruction to risc-v model (Sydney Gibson)
* add mq_sub and proof (Sydney Gibson)
* mq_add proof without assert false (Sydney Gibson)
* Working proof for mq_add (Sydney Gibson)
* remove assume in ntt_rec (yizhou7)
* remove assumes in ntt (yizhou7)
* import fix (yizhou7)
* file renames (yizhou7)
* a cleaner version of ntt_rec (yizhou7)
* uncomment post condition (yizhou7)
* remove one assume (yizhou7)
* fix error in ntt_rec (yizhou7)
* almost done with loop version (yizhou7)
* almost getting ntt working (yizhou7)
* first version of ntt_chunk_loop (yizhou7)
* ntt_chunk_loop seems to work (yizhou7)
* making progress on proving ntt_chunk_loop_inv (yizhou7)
* remove additional even/odd args (yizhou7)
* proved that ntt_level_implies_chunk_inv (yizhou7)
* restructure indicies computations (yizhou7)
* seems to have worked around the timeout issue (yizhou7)
* encounter timeout in loop ntt (yizhou7)
* cleaner version of ntt_rec3 (yizhou7)
* Partial progress on mq_add proof (Sydney Gibson)
* add chunk view functions (yizhou7)
* use ntt_rec3 as a reference (yizhou7)
* reduce the number of arguments passed around (yizhou7)
* ntt_rec2_chunk_rec seems to work (yizhou7)
* getting close to a level recursive version (yizhou7)
* convert ntt_rec2 into using ghost a (yizhou7)
* better post conditions for ntt_indicies_inv_consequence (yizhou7)
* incoperate indicies into rec2 (yizhou7)
* incoperate ntt_indicies_inv_consequence into ntt_rec2 (yizhou7)
* Clean up helpers (Bryan Parno)
* Helper goes through, modulo a few small helpers (Bryan Parno)
* a functional and recursive ntt_rec (yizhou7)
* add level chunk functions (yizhou7)
* Progress on the base case. (Bryan Parno)
* Induction step goes through for poly_eval_split_helper (Bryan Parno)
* More progress (Bryan Parno)
* ntt_chunk_base_case (yizhou7)
* rename chunk index into ki to better match the loop version
  (yizhou7)
* Experiment (Bryan Parno)
* fix up errors in pow2 (yizhou7)
* remove indicies in ntt_rec (yizhou7)
* ntt_indicies_inv_consequence seems done (yizhou7)
* Progress on poly_eval_split_lemma_helper (Bryan Parno)
* odd_indexed_indices_reorder symmetric (yizhou7)
* fix up index.py after Pow2 removal (yizhou7)
* a stronger inv for the bit reverse (yizhou7)
* Restore ntt_cancellation_lemma proof (Bryan Parno)
* Tediously prove nth_root_lemma() (Bryan Parno)
* making some progress on even index sorting (yizhou7)
* making some progress on bit rev (yizhou7)
* split files (yizhou7)
* at leat prove that the indicies are in bound (yizhou7)
* leave poly_eval_split_lemma as axiom (yizhou7)
* cleanup the odd even defs (yizhou7)
* make ntt_base_case assumption more explicit (yizhou7)
* cleanup the axioms a bit (yizhou7)
* Some proofs for the axioms (Bryan Parno)
* oops more typos (yizhou7)
* fix the N constant error (yizhou7)
* remove some assumes (yizhou7)
* naive forward ntt (yizhou7)
* some progress on y_k_value (yizhou7)
* ntt_base_case (yizhou7)
* start of naive ntt (yizhou7)
* Add stand-alone falcon sigverify routine + riscv asm (Sydney Gibson)
* mod_exp seems done (yizhou7)
* progress on mp post (yizhou7)
* more progress on mod_exp (yizhou7)
* progress on mp_loop (yizhou7)
* forgot to add vad file (yizhou7)
* mont_mul seems done (yizhou7)
* mont_mul_add seems done (yizhou7)
* progress on mma_post (yizhou7)
* mma inner loop (yizhou7)
* main mma_loop body seems done (yizhou7)
* progress on the main loop body (yizhou7)
* example usage of stack frame var in mma_pre (yizhou7)
* partial progress on mma (yizhou7)
* make stack push more opaque (yizhou7)
* update sub_mod to load symbolic n (yizhou7)
* update source to use n and d0inv as externs (yizhou7)
* add load symbol (yizhou7)
* mark consts_inv as opaque (yizhou7)
* add contants for heap (yizhou7)
* sub_mod seems to be okay (yizhou7)
* bug fix in subc (order of args wrong) (yizhou7)
* use modfies anyways (yizhou7)
* bug fix on state equal (yizhou7)
* use frame false for mula and mulaa (yizhou7)
* mul example (yizhou7)
* fix typo in mul name (yizhou7)
* moved examples (yizhou7)
* most of the instructions (yizhou7)
* add semantics for popm_w (yizhou7)
* reduce the number of post conditions in sw_stack (yizhou7)
* sw/lw with iters (yizhou7)
* add flags to msp state (yizhou7)
* add 16 bits based source file for msp modexp (yizhou7)
* add pushm_w semantics (yizhou7)
* add vale.i.dfy for msp (yizhou7)
* move reference (yizhou7)
* add starting defs for msp430 (yizhou7)
* add source for msp430 (yizhou7)
* cleanup build, mark stack_init as extern (yizhou7)
* fix up otbn ptr issue (yizhou7)
* add lw_prev (yizhou7)
* mostly fixed up mp (yizhou7)
* mm seems to work (yizhou7)
* revamped mma (yizhou7)
* mma seems to work (yizhou7)
* basic use case of save registers in mma (yizhou7)
* mostly fixed up the decls in rv (yizhou7)
* Update dependencies to include those needed for Vale, check those
  dependencies, and link to Vale install instructions (Jade Philipoom)
* update riscv vale.i.dfy (yizhou7)
* cleanup some duplicated defs (yizhou7)
* patch up a version using generated memory models (yizhou7)
* rename some files (yizhou7)
* template files (yizhou7)
* mem_t imem_t refinement (yizhou7)
* slightly change the inv proof interface (yizhou7)
* verify manual written files (yizhou7)
* most part of write_preserves_inv (yizhou7)
* stack model with frames (yizhou7)
* an untemplated version (yizhou7)
* 2/3 equiv invs (yizhou7)
* attempting to prove stack inv (yizhou7)
* commit a hand written version for stack modeling first (yizhou7)
* small write preserves inv (yizhou7)
* mostly done with 2 width (yizhou7)
* fix some typos (yizhou7)
* use of jinja macro (yizhou7)
* move script dir (yizhou7)
* remove rr lemma that is no longer needed (yizhou7)
* update build system (yizhou7)
* bug fix in montmul_inv_lemma (yizhou7)
* update mul256_correct_lemma (yizhou7)
* mostly done with simplification using gbassert (yizhou7)
* converted most gbasserts (yizhou7)
* remove unused and redundant code (Sydney Gibson)
* update the mont_loop_divisible_lemma proof to use gbassert (yizhou7)
* use a mod version of dafny (yizhou7)
* emit comment in modexp_printer (yizhou7)
* Finish NOP test suite and new printer config (Sydney Gibson)
* Add test cases for empty constructs and function calls (Sydney
  Gibson)
* Fix IfElse test cases(?) (Sydney Gibson)
* Working test set up + a few working NOP tests (Sydney Gibson)
* An alternate approach to overlap detection (Bryan Parno)
* simplify for testing purposes, fix printer call (Sydney Gibson)
* Another attempt at nop detection (Sydney Gibson)
* update the build slightly, add riscv printer output (yizhou7)
* Working nop checker (Sydney Gibson)
* update the dafny lib hash (yizhou7)
* add nop eval (Sydney Gibson)
* Get rid of warnings in make file (Sydney Gibson)
* remove duplicated files (yizhou7)
* seem be have located and fixed the printer branch bug (yizhou7)
* fix the skipping in ifelse (yizhou7)
* Add runnable riscv-rsa test assembly + harness (Sydney Gibson)
* Start NOP removal (Sydney Gibson)
* update printer (yizhou7)
* remove constants addresses from riscv impl (yizhou7)
* separate mod_pow and main (yizhou7)

Signed-off-by: Jade Philipoom <jadep@google.com>
diff --git a/sw/vendor/secure-foundations_veri-titan.lock.hjson b/sw/vendor/secure-foundations_veri-titan.lock.hjson
index 2e75266..5c37a25 100644
--- a/sw/vendor/secure-foundations_veri-titan.lock.hjson
+++ b/sw/vendor/secure-foundations_veri-titan.lock.hjson
@@ -9,6 +9,6 @@
   upstream:
   {
     url: https://github.com/secure-foundations/veri-titan.git
-    rev: 6e0137bbbebf851518bd7121d01c986c43ab552d
+    rev: a5b18fa0bea5d943c9267d3561cbd2704764a868
   }
 }
diff --git a/sw/vendor/veri-titan/gen/otbn_modexp.s b/sw/vendor/veri-titan/gen/otbn_modexp.s
index 3b47554..61f96d5 100644
--- a/sw/vendor/veri-titan/gen/otbn_modexp.s
+++ b/sw/vendor/veri-titan/gen/otbn_modexp.s
@@ -1,23 +1,65 @@
 /*
-  veri-titan commit hash: 47792932f788b92221ff61e4037811a867e747c7
+  This code is generated by the veri-titan project: https://github.com/secure-foundations/veri-titan
 */
-
 .globl modexp_var_3072_f4
 modexp_var_3072_f4:
+
+  /**
+   * Variable time 3072-bit modular exponentiation with exponent 65537
+   *
+   * Returns: C = modexp(A,65537) = mod M
+   *
+   * This implements the square and multiply algorithm for the
+   * F4 exponent (65537).
+   *
+   * The squared Montgomery modulus RR and the Montgomery constant m0' have to
+   * be provided at the appropriate locations in dmem. DMEM locations are
+   * expected to be disjoint.
+   *
+   * Flags: Flags have no meaning beyond the scope of this subroutine.
+   *
+   * The base bignum A is expected in the input buffer, the result C is written
+   * to the output buffer.
+   *
+   * @param[in]  dmem[x17] pointer to m0' in dmem
+   * @param[in]  dmem[x26] pointer to RR in dmem
+   * @param[in]  dmem[x16] pointer to first limb of modulus M in dmem
+   * @param[in]  dmem[x23] pointer to buffer with base bignum
+   * @param[in]  dmem[x24] pointer to output buffer
+   */
+
+  /* Prepare all-zero register. */
   bn.xor w31, w31, w31 << 0, FG0
+
+  /* Prepare pointers to temporary registers. */
   li x8, 4
   li x9, 3
   li x10, 4
   li x11, 2
+
+  /* Initialize a counter to double-check that the loop completes (a
+      protection against fault injection). */
+  li x5, 0
+
+  /* Convert input to Montgomery domain.
+       [w15:w4] <= (dmem[x23]*dmem[x26]*R^-1) mod M = (A * R) mod M */
   addi x19, x23, 0
   addi x20, x26, 0
   addi x21, x24, 0
   jal x1, montmul
+
+  /* Store Montgomery-form input in dmem.
+       dmem[x24] <= [w15:w4] = (A * R) mod M */
   loopi 12, 3
     bn.sid x8, 0(x21++)
     addi x8, x8, 1
-    nop
+    addi x5, x5, 1
+
+  /* 16 consecutive Montgomery squares on the outbut buffer, i.e. after loop:
+       dmem[out_buf] <= dmem[out_buf]^65536*R mod M */
   loopi 16, 9
+
+    /* dmem[out_buf]  <= montmul(dmem[out_buf], dmem[out_buf]) */
     addi x19, x24, 0
     addi x20, x24, 0
     addi x21, x24, 0
@@ -25,109 +67,282 @@
     loopi 12, 3
       bn.sid x8, 0(x21++)
       addi x8, x8, 1
-      nop
-    nop
+      addi x5, x5, 1
+
+    /* Update counter. */
+    addi x5, x5, 1
+
+  /* Final multiplication and conversion of result from Montgomery domain.
+       out_buf  <= montmul(*x28, *x20) = montmul(dmem[in_buf], dmem[out_buf]) */
   addi x19, x23, 0
   addi x20, x24, 0
   addi x21, x24, 0
   jal x1, montmul
+
+  /* Final conditional subtraction of modulus if mod >= dmem[out_buf]. */
+
+  /* Speculatively subtract modulus and store in separate registers.
+       [w28:w17] <= (dmem[out_buf] - M) mod 2^3072 */
   bn.add w31, w31, w31 << 0, FG0
   li x17, 16
-  loopi 12, 5
+  loopi 12, 4
     bn.movr x11, x8++
     bn.lid x9, 0(x16++)
     bn.subb w2, w2, w3 << 0, FG0
     bn.movr x17++, x11
-    nop
+
+  /* Check the borrow flag and select the post-subtraction version iff the
+     subtraction underflowed. */
   csrrs x2, 1984, x0
   andi x2, x2, 1
   li x8, 4
   bne x2, x0, label_0
   li x8, 16
   label_0:
+
+  /* Store result in dmem: dmem[out_buf] <= A^65537 mod M */
   addi x21, x24, 0
   loopi 12, 3
     bn.sid x8, 0(x21++)
     addi x8, x8, 1
-    nop
+    addi x5, x5, 1
+
+  /* If the counter value doesn't match expectations, cause a deliberate
+     error (WDR reference > 31) to end the program. */
+  li x2, 232
+  beq x2, x5, label_1
+  li x2, 255
+  bn.sid x0, 0(x2)
+  label_1:
   ret
 
 .globl montmul
 montmul:
+
+  /**
+   * Variable-time 3072-bit Montgomery Modular Multiplication
+   *
+   * Returns: C = montmul(A,B) = A*B*R^(-1) mod M
+   *
+   * This implements the limb-by-limb interleadved Montgomory Modular
+   * Multiplication Algorithm. This is only a wrapper around the main loop body.
+   * For algorithmic implementation details see the mont_loop subroutine.
+   *
+   * Flags: Flags have no meaning beyond the scope of this subroutine.
+   *
+   * @param[in]  x16: dptr_M, dmem pointer to first limb of modulus M
+   * @param[in]  x17: dptr_m0d, dmem pointer to Montgomery Constant m0'
+   * @param[in]  x19: dptr_a, dmem pointer to first limb of operand A
+   * @param[in]  x20: dptr_b, dmem pointer to first limb of operand B
+   * @param[in]  w31: all-zero
+   * @param[in]  x9: pointer to temp reg, must be set to 3
+   * @param[in]  x10: pointer to temp reg, must be set to 4
+   * @param[in]  x11: pointer to temp reg, must be set to 2
+   * @param[out] [w15:w4]: result C
+   */
+
+  /* Load Montgomery constant: w3 <= m0' */
   bn.lid x9, 0(x17)
+
+  /* Initialize result buffer with zeroes. */
   bn.mov w2, w31
-  loopi 12, 2
+  loopi 12, 1
     bn.movr x10++, x11
-    nop
-  loopi 12, 7
+
+  /* Iterate over limbs of input operand. */
+  loopi 12, 6
+
+    /* Load limb of operand. */
     bn.lid x11, 0(x20++)
+
+    /* Save some register values. */
     addi x6, x16, 0
     addi x7, x19, 0
+
+    /* Main loop body of Montgomery multiplication algorithm. */
     jal x1, mont_loop
+
+    /* Restore registers. */
     addi x16, x6, 0
     addi x19, x7, 0
-    nop
+
+  /* Restore pointers. */
   li x8, 4
   li x10, 4
   ret
 
 mont_loop:
+
+  /**
+  * Main loop body for variable-time 3072-bit Montgomery Modular Multiplication
+  *
+  * Returns: C <= (C + A*b_i + M*m0'*(C[0] + A[0]*b_i))/(2^WLEN) mod R
+  *
+  * This implements the main loop body for the Montgomery Modular Multiplication
+  * as well as the conditional subtraction. See e.g. Handbook of Applied
+  * Cryptography (HAC) 14.36 (steps 2.1 and 2.2) and step 3.
+  * This subroutine has to be called for every iteration of the loop in step 2
+  * of HAC 14.36, i.e. once per limb of operand B (x in HAC notation). The limb
+  * is supplied via w2. In the comments below, the index i refers to the
+  * i_th call to this subroutine within one full Montgomery Multiplication run.
+  * Step 3 of HAC 14.36 is replaced by the approach to perform the conditional
+  * subtraction when the intermediate result is larger than R instead of m. See
+  * e.g. https://eprint.iacr.org/2017/1057 section 2.4.2 for a justification.
+  * This does not omit the conditional subtraction.
+  * Variable names of HAC are mapped as follows to the ones used in the
+  * this library: x=B, y=A, A=C, b=2^WLEN, m=M, R=R, m' = m0', n=N.
+  *
+  * Flags: The states of both FG0 and FG1 depend on intermediate values and are
+  *        not usable after return.
+  *
+  * @param[in]  x16: dmem pointer to first limb of modulus M
+  * @param[in]  x19: dmem pointer to first limb operand A
+  * @param[in]  w2:  current limb of operand B, b_i
+  * @param[in]  w3:  Montgomery constant m0'
+  * @param[in]  w31: all-zero
+  * @param[in]  [w15:w4] intermediate result A
+  * @param[out] [w15:w4] intermediate result A
+  *
+  */
+
+  /* Save pointer to modulus. */
   addi x22, x16, 0
+
+  /* Pointers to temporary registers. */
   li x12, 30
   li x13, 24
+
+  /* Buffer read pointer. */
   li x8, 4
+
+  /* Buffer write pointer. */
   li x10, 4
+
+  /* Load 1st limb of input y (operand a): w30 = y[0] */
   bn.lid x12, 0(x19++)
+
+  /* [w26, w27] = w30*w2 = y[0]*xi */
   jal x1, mul256_w30xw2
+
+  /* w24 = w4 = A[0] */
   bn.movr x13, x8++
+
+  /* add A[0]: [w29, w30] = [w26, w27] + w24 = y[0]*xi + A[0] */
   bn.add w30, w27, w24 << 0, FG0
   bn.addc w29, w26, w31 << 0, FG0
+
+  /* w25 = w3 = m0d */
   bn.mov w25, w3
+
+  /* [_, ui] = [w26, w27] = w30*w25 = (y[0]*xi + A[0])*m0d*/
   jal x1, mul256_w30xw25
+
+  /* [_, ui] = [w28, w25] = [w26, w27]  */
   bn.mov w25, w27
+
+  /* w24 <= w30 =  y[0]*xi + A[0] mod b */
   bn.mov w24, w30
+
+  /* Load first limb of modulus: w30 <= m[0]. */
   bn.lid x12, 0(x16++)
+
+  /* [w26, w27] = w30*w25 = m[0]*ui */
   jal x1, mul256_w30xw25
+
+  /* [w28, w27] <= [w26, w27] + w24 = m[0]*ui + (y[0]*xi + A[0] mod b) */
   bn.add w27, w27, w24 << 0, FG0
   bn.addc w28, w26, w31 << 0, FG0
-  loopi 11, 15
+
+  /* This loop implements step 2.2 of HAC 14.36 with a word-by-word approach.
+     The loop body is subdivided into two steps. Each step performs one
+     multiplication and subsequently adds two WLEN sized words to the
+     2WLEN-sized result, such that there are no overflows at the end of each
+     step-
+     Two carry words are required between the cycles. Those are c_xy and c_m.
+     Assume that the variable j runs from 1 to N-1 in the explanations below.
+     A cycle 0 is omitted, since the results from the computations above are
+     re-used */
+  loopi 11, 14
+
+    /* Load limb of y (operand a): w30 <= y[j] */
     bn.lid x12, 0(x19++)
+
+    /* Load limb of buffer: w24 <= A[j] */
     bn.movr x13, x8++
+
+    /* Multiply y[j]*xi, add A[j], and add carry from previous iteration:
+         [w26, w27] <= w30*w2 + w24 + w29 = y[j]*x_i + A[j] + c_xy */
     jal x1, mul256_w30xw2
     bn.add w27, w27, w24 << 0, FG0
     bn.addc w26, w26, w31 << 0, FG0
     bn.add w24, w27, w29 << 0, FG0
     bn.addc w29, w26, w31 << 0, FG0
+
+    /* Step 2:  Second multiplication computes the product of a limb m[j] of
+       the modulus with u_i. The 2nd carry word from the previous loop cycle
+       c_m and the lower word a_tmp of the result of Step 1 are added. */
+
+    /* Load limb m[j] of modulus: w30 <= m[j] */
     bn.lid x12, 0(x16++)
+
+    /* Multiply with ui and add result from first step:
+         [w28, w24] <= w30*w25 + w24 + w28 = m[j]*ui + a_tmp + c_m */
     jal x1, mul256_w30xw25
     bn.add w27, w27, w24 << 0, FG0
     bn.addc w26, w26, w31 << 0, FG0
     bn.add w24, w27, w28 << 0, FG0
     bn.addc w28, w26, w31 << 0, FG0
+
+    /* store at w[4+j] = A[j-1]
+       This includes the reduction by 2^WLEN = 2^b in step 2.2 of HAC 14.36 */
     bn.movr x10++, x13
-    nop
+
+  /* Most significant limb of A is sum of the carry words of last loop cycle.
+     A[N-1] = w24 <- w29 + w28 = c_xy + c_m */
   bn.add w24, w29, w28 << 0, FG1
   bn.movr x10++, x13
+
+  /* Clear flag group 0. */
   bn.add w31, w31, w31 << 0, FG0
+
+  /* The below replaces Step 3 of HAC 14.36 and performs conditional
+     subtraction of the modulus from the output buffer.  */
+
+  /* Read carry flag 1 into a register: x2 <= FG1.C */
   csrrs x2, 1985, x0
   andi x2, x2, 1
-  beq x2, x0, label_1
+
+  /* Subtract modulus if the carry was 1; otherwise skip. */
+  beq x2, x0, label_2
   li x12, 30
   li x13, 24
   addi x16, x22, 0
   li x8, 4
-  loopi 12, 5
+  loopi 12, 4
     bn.lid x13, 0(x16++)
     bn.movr x12, x8
     bn.subb w24, w30, w24 << 0, FG0
     bn.movr x8++, x13
-    nop
-  label_1:
+  label_2:
+
+  /* Restore pointers. */
   li x8, 4
   li x10, 4
   ret
 
 mul256_w30xw2:
+
+  /**
+   * Unrolled 512=256x256 bit multiplication.
+   *
+   * Returns c = a x b.
+   *
+   * Flags: No flags are set in this subroutine
+   *
+   * @param[in] w30: a, first operand
+   * @param[in] w2: b, second operand
+   * @param[out] [w26, w27]: c, result
+   */
   bn.mulqacc.z w30.0, w2.0, 0
   bn.mulqacc w30.1, w2.0, 64
   bn.mulqacc.so w27.L, w30.0, w2.1, 64, FG0
@@ -147,6 +362,18 @@
   ret
 
 mul256_w30xw25:
+
+  /**
+   * Unrolled 512=256x256 bit multiplication.
+   *
+   * Returns c = a x b.
+   *
+   * Flags: No flags are set in this subroutine
+   *
+   * @param[in] w30: a, first operand
+   * @param[in] w25: b, second operand
+   * @param[out] [w26, w27]: c, result
+   */
   bn.mulqacc.z w30.0, w25.0, 0
   bn.mulqacc w30.1, w25.0, 64
   bn.mulqacc.so w27.L, w30.0, w25.1, 64, FG0
diff --git a/sw/vendor/veri-titan/gen/riscv_modexp.s b/sw/vendor/veri-titan/gen/riscv_modexp.s
new file mode 100644
index 0000000..f52822a
--- /dev/null
+++ b/sw/vendor/veri-titan/gen/riscv_modexp.s
@@ -0,0 +1,300 @@
+/*
+  This code is generated by the veri-titan project: https://github.com/secure-foundations/veri-titan
+
+  The mod_pow assembly snippet expects arguments in the following way:
+
+  a0:
+      @param d0inv      Precomputed Montgomery constant, considered part of key d0inv=-n^(-1) mod R
+
+  a1: 
+      @param out        Output message as little-endian array
+
+  a2:
+      @param workbuf32  Work buffer, caller must verify this is 2 x RSANUMWORDS elements long.
+
+  a3:
+      @param rr         Precomputed constant, (R*R) mod n, considered part of key
+
+  a4:
+      @param n          Modulus of key
+
+  a5:
+      @param in         Input signature as little-endian array
+
+  It should correspond to this C signature:
+
+  void mod_pow(const uint32_t d0inv,
+          uint32_t *out,
+          uint32_t *workbuf32,
+          const uint32_t * rr,
+          const uint32_t *n,
+          uint32_t *in)
+*/
+.globl mod_pow
+mod_pow:
+  addi sp, sp, -32
+  sw ra, 28(sp)
+  sw s0, 24(sp)
+  sw s1, 20(sp)
+  sw s2, 16(sp)
+  sw s3, 12(sp)
+  sw s4, 8(sp)
+  sw s5, 4(sp)
+  sw s6, 0(sp)
+  addi s0, a2, 0
+  addi s6, a4, 0
+  addi s3, a0, 0
+  addi s5, a5, 0
+  addi s4, a1, 0
+  addi s2, a2, 384
+  addi a2, a3, 0
+  addi a1, s0, 0
+  addi a3, s5, 0
+  call mont_mul
+  li s1, 8
+
+w_start0:
+  bleu s1, x0, w_end0
+  addi a3, s0, 0
+  addi a2, s0, 0
+  addi a1, s2, 0
+  addi a0, s3, 0
+  addi a4, s6, 0
+  call mont_mul
+  addi a3, s2, 0
+  addi a2, s2, 0
+  addi a1, s0, 0
+  addi a0, s3, 0
+  addi a4, s6, 0
+  call mont_mul
+  addi s1, s1, -1
+  j w_start0
+
+w_end0:
+  addi a0, s3, 0
+  addi a3, s5, 0
+  addi a2, s0, 0
+  addi a1, s4, 0
+  addi a4, s6, 0
+  call mont_mul
+  addi a0, s4, 0
+  addi a1, s6, 0
+  call ge_mod
+  beq a0, x0, if_true1
+  j if_end1
+
+if_true1:
+  addi a0, s4, 0
+  addi a1, s6, 0
+  call sub_mod
+
+if_end1:
+  lw ra, 28(sp)
+  lw s0, 24(sp)
+  lw s1, 20(sp)
+  lw s2, 16(sp)
+  lw s3, 12(sp)
+  lw s4, 8(sp)
+  lw s5, 4(sp)
+  lw s6, 0(sp)
+  addi sp, sp, 32
+  ret
+
+mont_mul:
+  addi sp, sp, -28
+  sw ra, 24(sp)
+  sw s0, 20(sp)
+  sw s1, 16(sp)
+  sw s2, 12(sp)
+  sw s3, 8(sp)
+  sw s4, 4(sp)
+  sw s5, 0(sp)
+  addi s0, a0, 0
+  addi s1, a1, 0
+  addi s2, a2, 0
+  addi s3, a3, 0
+  addi s5, a4, 0
+  addi s4, s1, 384
+
+w_start2:
+  bgeu s1, s4, w_end2
+  sw x0, 0(s1)
+  addi s1, s1, 4
+  j w_start2
+
+w_end2:
+  addi s1, a1, 0
+  addi s4, s2, 384
+
+w_start3:
+  bgeu s2, s4, w_end3
+  addi a1, s1, 0
+  addi a0, s0, 0
+  addi a3, s3, 0
+  addi a4, s5, 0
+  lw a2, 0(s2)
+  call mont_mul_add
+  addi s2, s2, 4
+  j w_start3
+
+w_end3:
+  lw ra, 24(sp)
+  lw s0, 20(sp)
+  lw s1, 16(sp)
+  lw s2, 12(sp)
+  lw s3, 8(sp)
+  lw s4, 4(sp)
+  lw s5, 0(sp)
+  addi sp, sp, 28
+  ret
+
+mont_mul_add:
+  addi sp, sp, -40
+  sw ra, 36(sp)
+  sw s0, 32(sp)
+  sw s1, 28(sp)
+  sw s2, 24(sp)
+  sw s3, 20(sp)
+  sw s4, 16(sp)
+  sw s5, 12(sp)
+  sw s6, 8(sp)
+  sw s7, 4(sp)
+  sw s8, 0(sp)
+  addi s6, a1, 0
+  lw a1, 0(a3)
+  addi s7, a2, 0
+  lw a2, 0(s6)
+  addi s5, a0, 0
+  addi a0, s7, 0
+  addi s4, a3, 0
+  call mula32
+  mul s5, a0, s5
+  addi s8, a4, 0
+  addi s0, a1, 0
+  lw a1, 0(s8)
+  addi a2, a0, 0
+  addi s2, s8, 4
+  addi s4, s4, 4
+  addi s3, s6, 0
+  addi s8, s8, 384
+  addi a0, s5, 0
+  call mula32
+  addi s1, a1, 0
+
+w_start4:
+  bgeu s2, s8, w_end4
+  lw a2, 4(s3)
+  lw a1, 0(s4)
+  addi a3, s0, 0
+  addi a0, s7, 0
+  call mulaa32
+  addi s0, a1, 0
+  lw a1, 0(s2)
+  addi a2, a0, 0
+  addi a3, s1, 0
+  addi a0, s5, 0
+  call mulaa32
+  sw a0, 0(s3)
+  addi s2, s2, 4
+  addi s1, a1, 0
+  addi s4, s4, 4
+  addi s3, s3, 4
+  j w_start4
+
+w_end4:
+  add s0, s0, s1
+  sw s0, 0(s3)
+  bltu s0, s1, if_true5
+  j if_end5
+
+if_true5:
+  addi a0, s6, 0
+  addi a1, s2, -384
+  call sub_mod
+
+if_end5:
+  lw ra, 36(sp)
+  lw s0, 32(sp)
+  lw s1, 28(sp)
+  lw s2, 24(sp)
+  lw s3, 20(sp)
+  lw s4, 16(sp)
+  lw s5, 12(sp)
+  lw s6, 8(sp)
+  lw s7, 4(sp)
+  lw s8, 0(sp)
+  addi sp, sp, 40
+  ret
+
+mula32:
+  mul a5, a0, a1
+  mulhu a1, a0, a1
+  add a0, a5, a2
+  sltu a5, a0, a5
+  add a1, a1, a5
+  ret
+
+mulaa32:
+  mul a5, a0, a1
+  mulhu a1, a0, a1
+  add a0, a5, a2
+  sltu a5, a0, a5
+  add a1, a1, a5
+  add a0, a0, a3
+  sltu a5, a0, a3
+  add a1, a1, a5
+  ret
+
+sub_mod:
+  addi a2, a1, 0
+  addi a6, a2, 384
+  li a5, 0
+  li a1, 0
+
+w_start6:
+  beq a2, a6, w_end6
+  lw a4, 0(a0)
+  lw a3, 0(a2)
+  addi a2, a2, 4
+  add a5, a5, a4
+  sub a3, a5, a3
+  sltu a4, a5, a4
+  add a4, a4, a1
+  sltu a5, a5, a3
+  sw a3, 0(a0)
+  addi a0, a0, 4
+  sub a5, a4, a5
+  srai a1, a5, 31
+  j w_start6
+
+w_end6:
+  ret
+
+ge_mod:
+  addi a0, a0, 380
+  addi a5, a1, 380
+  addi a2, x0, 1
+
+w_start7:
+  beq a2, x0, w_end7
+  lw a3, 0(a0)
+  lw a4, 0(a5)
+  sub a2, a3, a4
+  sltu a3, a3, a4
+  sltu a4, x0, a2
+  xor a2, a1, a5
+  bne a4, x0, if_true8
+  j if_end8
+
+if_true8:
+  add a2, x0, x0
+
+if_end8:
+  addi a0, a0, -4
+  addi a5, a5, -4
+  j w_start7
+
+w_end7:
+  addi a0, a3, 0
+  ret
+