[mask_rom, sig_verify] Add Montgomery multiplication

This change adds a Montgomery multiplication implementation based on
Alg. 14.36 in Handbook of Applied Cryptography.

Signed-off-by: Alphan Ulusoy <alphan@google.com>
diff --git a/sw/device/silicon_creator/mask_rom/rsa_verify.c b/sw/device/silicon_creator/mask_rom/rsa_verify.c
new file mode 100644
index 0000000..9974342
--- /dev/null
+++ b/sw/device/silicon_creator/mask_rom/rsa_verify.c
@@ -0,0 +1,80 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "sw/device/silicon_creator/mask_rom/rsa_verify.h"
+
+#include <memory.h>
+#include <stddef.h>
+
+/**
+ * Subtracts `b` from `a` in-place, i.e. `a -= b`.
+ *
+ * `a` must be greater than or equal to `b`.
+ *
+ * See Handbook of Applied Cryptography, Ch. 14, Alg. 14.9.
+ *
+ * @param a A `kRsaNumWords` long buffer, little-endian.
+ * @param b A `kRsaNumWords` long buffer, little-endian.
+ */
+static void subtract(uint32_t *a, const uint32_t *b) {
+  uint32_t borrow = 0;
+  for (size_t i = 0; i < kRsaNumWords; ++i) {
+    uint64_t diff = (uint64_t)a[i] - b[i] - borrow;
+    borrow = diff > a[i];
+    a[i] = (uint32_t)diff;
+  }
+}
+
+// FIXME: Merge this comment with the one in the header file.
+// This function implements Alg. 14.36 in Handbook of Applied Cryptography:
+// 1. result = 0
+// 2. For i from 0 to (n - 1):
+// 		2.1. u_i = (result_0 + x_i * y_0) * m' mod b
+// 		2.2. result = (result + x_i * y + u_i * m) / b
+// 3. If result >= m then result = result - m
+// 4. Return result
+void mont_mul(const uint32_t *x, const uint32_t *y, const uint32_t *m,
+              const uint32_t m_prime, uint32_t *result) {
+  memset(result, 0, kRsaNumWords * sizeof(uint32_t));
+
+  for (size_t i = 0; i < kRsaNumWords; ++i) {
+    // The loop below reads one word ahead of writes to avoid a separate loop
+    // for the division by `b` in step 2.2 of the algorithm. Thus, `acc0` and
+    // `acc1` are initialized here before the loop. `acc0` holds the sum of
+    // first two addends in step 2.2 while `acc1` holds the entire sum. Carries
+    // of these operations are stored separately in the upper words of `acc0`
+    // and `acc1`. `acc0` and `acc1` can safely store these intermediate values,
+    // i.e. without wrapping, because UINT32_MAX^2 + 2*UINT32_MAX is
+    // 0xffff_ffff_ffff_ffff.
+
+    // Holds the sum of the first two addends in step 2.2.
+    uint64_t acc0 = (uint64_t)x[i] * y[0] + result[0];
+    const uint32_t u_i = (uint32_t)acc0 * m_prime;
+    // Holds the sum of the all three addends in step 2.2.
+    uint64_t acc1 = (uint64_t)u_i * m[0] + (uint32_t)acc0;
+
+    // Process the i^th digit of `x`, i.e. `x[i]`.
+    for (size_t j = 1; j < kRsaNumWords; ++j) {
+      acc0 = (uint64_t)x[i] * y[j] + result[j] + (acc0 >> 32);
+      acc1 = (uint64_t)u_i * m[j] + (uint32_t)acc0 + (acc1 >> 32);
+      result[j - 1] = (uint32_t)acc1;
+    }
+    acc0 = (acc0 >> 32) + (acc1 >> 32);
+    result[kRsaNumWords - 1] = (uint32_t)acc0;
+
+    // The intermediate result of this algorithm before the check below is
+    // bounded by R + m (Eq. (4) in Montgomery Arithmetic from a Software
+    // Perspective, Bos. J. W, Montgomery, P. L.) where `m` is an integer with
+    // `kRsaNumWords` base 2^32 digits, `R` is 2^(`kRsaNumWords`*32), and m < R.
+    // Therefore, if there is a carry, then `result` is not the least
+    // non-negative residue of x*y*R^-1 mod m. Since `acc0 >> 32` here is at
+    // most 1, we can subtract `m` from `result` without taking it into account
+    // and fit `result` into `kRsaNumWords`. Since this is not a direct
+    // comparison with `m`, the final result is not guaranteed to be the the
+    // least non-negative residue of x*y*R^-1 mod m.
+    if (acc0 >> 32) {
+      subtract(result, m);
+    }
+  }
+}
diff --git a/sw/device/silicon_creator/mask_rom/rsa_verify.h b/sw/device/silicon_creator/mask_rom/rsa_verify.h
new file mode 100644
index 0000000..1f818d2
--- /dev/null
+++ b/sw/device/silicon_creator/mask_rom/rsa_verify.h
@@ -0,0 +1,47 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef OPENTITAN_SW_DEVICE_SILICON_CREATOR_MASK_ROM_RSA_VERIFY_H_
+#define OPENTITAN_SW_DEVICE_SILICON_CREATOR_MASK_ROM_RSA_VERIFY_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+enum {
+  /**
+   * Number of words of the modulus and signature for RSA-3072.
+   */
+  kRsaNumWords = 96,
+};
+
+// FIXME: Make static and move this comment to the source file. This is here
+// just to be able to add a simple test.
+/**
+ * Computes the Montgomery reduction of the product of two integers.
+ *
+ * Given x, y, m, and m', this function computes x*y*R^-1 mod m, where
+ * - x, y, and m are integers with kRsaNumWords base 2^32 digits,
+ * - m' = -m^-1 mod b,
+ * - R is (2^32)^kRsaNumWords, e.g. 2^3072 for RSA-3072, and
+ * - b is 2^32.
+ *
+ * See Handbook of Applied Cryptography, Ch. 14, Alg. 14.36.
+ *
+ * @param x A `kRsaNumWords` long buffer, little-endian.
+ * @param y A `kRsaNumWords` long buffer, little-endian.
+ * @param m A `kRsaNumWords` long buffer, little-endian.
+ * @param m_prime Negative of the multiplicative inverse of m modulo b.
+ * @param[out] result A `kRsaNumWords` long buffer, little-endian.
+ */
+void mont_mul(const uint32_t *x, const uint32_t *y, const uint32_t *m,
+              uint32_t m_prime, uint32_t *result);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // OPENTITAN_SW_DEVICE_SILICON_CREATOR_MASK_ROM_RSA_VERIFY_H_