[otbn] add P-384 ECDSA signature verification
Adds ECDSA signature verification algorithm
for curve P-384 as a dedicated lib. The implementation
is variable-time and is independent from the scalar
multiplication and inversion routines of the signing lib.
However, it depends on point addition routines from the
P-384 base lib.
Signed-off-by: Felix Miller <felix.miller@gi-de.com>
diff --git a/sw/otbn/code-snippets/p384_ecdsa_verify_test.s b/sw/otbn/code-snippets/p384_ecdsa_verify_test.s
new file mode 100644
index 0000000..1b34756
--- /dev/null
+++ b/sw/otbn/code-snippets/p384_ecdsa_verify_test.s
@@ -0,0 +1,146 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+/*
+ * Standalone test for P-384 ECDSA signature verification
+ *
+ * Runs the P-384 ECDSA signature verification algorithm for message, signature
+ * and public key contained in the .data section below.
+ *
+ * See comment at the end of the file for expected values of result.
+ */
+
+.section .text
+
+p384_ecdsa_verify_test:
+
+ /* set dmem pointer to point to message */
+ la x2, msg
+ la x3, dptr_msg
+ sw x2, 0(x3)
+
+ /* set dmem pointer to point to signature */
+ la x2, sig_r
+ la x3, dptr_r
+ sw x2, 0(x3)
+ la x2, sig_s
+ la x3, dptr_s
+ sw x2, 0(x3)
+
+ /* set dmem pointer to point to public key */
+ la x2, pub_x
+ la x3, dptr_x
+ sw x2, 0(x3)
+ la x2, pub_y
+ la x3, dptr_y
+ sw x2, 0(x3)
+
+ /* set dmem pointer to point to signature verification result */
+ la x2, sig_xres
+ la x3, dptr_rnd
+ sw x2, 0(x3)
+
+ /* call ECDSA signature verification subroutine in P-384 lib */
+ jal x1, p384_verify
+
+ /* load verification result x_res to [w1,w0] for comparison with reference */
+ li x2, 0
+ la x3, sig_xres
+ bn.lid x2++, 0(x3)
+ bn.lid x2, 32(x3)
+
+ ecall
+
+
+.data
+
+/* message */
+msg:
+ .word 0x55555555
+ .word 0x55555555
+ .word 0x55555555
+ .word 0x55555555
+ .word 0x55555555
+ .word 0x55555555
+ .word 0x55555555
+ .word 0x55555555
+ .word 0x55555555
+ .word 0x55555555
+ .word 0x55555555
+ .word 0x55555555
+ .zero 16
+
+/* signature R */
+sig_r:
+ .word 0xb68c28d8
+ .word 0x2b23ce3a
+ .word 0x9a1a30fc
+ .word 0x56e186cf
+ .word 0x12d35b38
+ .word 0xc16c09de
+ .word 0x0235d77e
+ .word 0x49d29eef
+ .word 0xd3c43053
+ .word 0xb45990db
+ .word 0x7c0d8125
+ .word 0xb2fcf95c
+ .zero 16
+
+/* signature S */
+sig_s:
+ .word 0x24bc1bf9
+ .word 0x752042f5
+ .word 0x98144c27
+ .word 0x77e415a1
+ .word 0xa78101eb
+ .word 0x0016f9c3
+ .word 0x3e7f6895
+ .word 0x80eb391d
+ .word 0xf19a653d
+ .word 0xfa9554e0
+ .word 0xe34d88c1
+ .word 0x1a72ebdd
+ .zero 16
+
+/* public key x-coordinate */
+pub_x:
+ .word 0x4877f3d1
+ .word 0x7b829460
+ .word 0xb1cac609
+ .word 0x5869de54
+ .word 0xee0e2beb
+ .word 0x6c30f2d8
+ .word 0x47e80661
+ .word 0x394d8b70
+ .word 0xcf60d89e
+ .word 0x1a9ea916
+ .word 0xb439d701
+ .word 0xca230836
+ .zero 16
+
+/* public key y-coordinate */
+pub_y:
+ .word 0xc181f90f
+ .word 0xc31ef079
+ .word 0xbf3aff6e
+ .word 0xc7e55880
+ .word 0xec18818c
+ .word 0xcea028a9
+ .word 0x928c3e92
+ .word 0x82b63bf3
+ .word 0xd65e905d
+ .word 0x68eef2d1
+ .word 0x03afe2c2
+ .word 0xaaafcad2
+ .zero 16
+
+/* signature verification result x_res */
+sig_xres:
+ .zero 64
+
+
+/* Expected values in wide register file (x-coordinate of result):
+ [w1, w0] is x_res == sig_r
+ w0 = 0x49d29eef0235d77ec16c09de12d35b3856e186cf9a1a30fc2b23ce3ab68c28d8
+ w1 = 0x00000000000000000000000000000000b2fcf95c7c0d8125b45990dbd3c43053
+*/
diff --git a/sw/otbn/code-snippets/p384_verify.s b/sw/otbn/code-snippets/p384_verify.s
new file mode 100644
index 0000000..d894097
--- /dev/null
+++ b/sw/otbn/code-snippets/p384_verify.s
@@ -0,0 +1,617 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+/*
+ * P-384 specific routines for ECDSA signature verification.
+ */
+
+ .section .text
+
+/**
+ * 384-bit variable time modular multiplicative inverse computation
+ *
+ * Returns c <= a^(-1) mod m
+ * where 'a' is a bigint of length 384 bit with a < m
+ * 'm' is the modulus with a length of 384 bit
+ * 'c' is a 384-bit result
+ *
+ * This routine implements the computation of the modular multiplicative
+ * inverse based on the binary GCD or Stein's algorithm.
+ * The implemented variant is based on the "right-shift binary extended GCD"
+ * as it is described in section 3.1 of [1] (Algorithm 1).
+ * [1] https://doi.org/10.1155/ES/2006/32192
+ *
+ * Note that this is a variable time implementation. I.e. this routine will
+ * show a data-dependent timing and execution profile. Only use where a
+ * full white-box scenario is acceptable.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in] [w30, w29]: a, 384-bit operand
+ * @param[in] [w13, w12]: m, modulus
+ * @param[in] w31: all-zero
+ * @param[out] [w17,w16]: result c
+ *
+ * clobbered registers: x2, w4 to w11, w16 to w19
+ * clobbered flag groups: FG0
+ */
+mod_inv_var:
+ /* [w5,w4] = r <= 0 */
+ bn.xor w4, w4, w4
+ bn.xor w5, w5, w5
+
+ /* [w7,w6] = s <= 1 */
+ bn.addi w6, w31, 1
+ bn.xor w7, w7, w7
+
+ /* [w9,w8] = u <= m = [w13, w12]*/
+ bn.mov w8, w12
+ bn.mov w9, w13
+
+ /* [w11,w10] = v <= [w30, w29] */
+ bn.mov w10, w29
+ bn.mov w11, w30
+
+ ebgcd_loop:
+ /* test if u is odd: OR w8 with itself, then probe bit 2 of CSR 1984 */
+ bn.or w8, w8, w8
+ csrrs x2, 1984, x0
+ andi x2, x2, 4
+ bne x2, x0, ebgcd_u_odd
+
+ /* u is even: */
+ /* [w9,w8] = u <= u/2 = [w9,w8] >> 1 */
+ bn.rshi w8, w9, w8 >> 1
+ bn.rshi w9, w31, w9 >> 1
+
+ /* test if r is odd */
+ bn.or w4, w4, w4
+ csrrs x2, 1984, x0
+ andi x2, x2, 4
+ bne x2, x0, ebgcd_r_odd
+
+ /* r is even: */
+ /* [w5,w4] = r <= r/2 = [w5,w4] >> 1 */
+ bn.rshi w4, w5, w4 >> 1
+ bn.rshi w5, w31, w5 >> 1
+ jal x0, ebgcd_loop
+
+ ebgcd_r_odd:
+ /* [w5,w4] = r <= (r + m)/2 = ([w5,w4] + [w13,w12]) >> 1 */
+ bn.add w4, w4, w12
+ bn.addc w5, w5, w13
+ bn.rshi w4, w5, w4 >> 1
+ bn.rshi w5, w31, w5 >> 1
+ jal x0, ebgcd_loop
+
+ ebgcd_u_odd:
+ /* test if v is odd */
+ bn.or w10, w10, w10
+ csrrs x2, 1984, x0
+ andi x2, x2, 4
+ bne x2, x0, ebgcd_uv_odd
+
+ /* v is even: */
+ /* [w11,w10] = v <= v/2 = [w11,w10] >> 1 */
+ bn.rshi w10, w11, w10 >> 1
+ bn.rshi w11, w31, w11 >> 1
+
+ /* test if s is odd */
+ bn.or w6, w6, w6
+ csrrs x2, 1984, x0
+ andi x2, x2, 4
+ bne x2, x0, ebgcd_s_odd
+
+ /* s is even: */
+ /* [w7,w6] = s <= s/2 = [w7,w6] >> 1 */
+ bn.rshi w6, w7, w6 >> 1
+ bn.rshi w7, w31, w7 >> 1
+ jal x0, ebgcd_loop
+
+ ebgcd_s_odd:
+ /* [w7,w6] = s <= (s + m)/2 = ([w7,w6] + [w13,w12]) >> 1 */
+ bn.add w6, w6, w12
+ bn.addc w7, w7, w13
+ bn.rshi w6, w7, w6 >> 1
+ bn.rshi w7, w31, w7 >> 1
+ jal x0, ebgcd_loop
+
+ ebgcd_uv_odd:
+ /* test if v >= u */
+ bn.cmp w10, w8
+ bn.cmpb w11, w9
+ csrrs x2, 1984, x0
+ andi x2, x2, 1
+ beq x2, x0, ebgcd_v_gte_u
+
+ /* u > v: */
+ /* [w5,w4] = r <= r - s = [w5,w4] - [w7,w6]; if (r < 0): r <= r + m */
+ bn.sub w4, w4, w6
+ bn.subb w5, w5, w7
+ bn.add w18, w4, w12
+ bn.addc w19, w5, w13
+ bn.sel w4, w18, w4, C
+ bn.sel w5, w19, w5, C
+
+ /* [w9,w8] = u <= u - v = [w9,w8] - [w11,w10] */
+ bn.sub w8, w8, w10
+ bn.subb w9, w9, w11
+ jal x0, ebgcd_loop
+
+ ebgcd_v_gte_u:
+ /* [w7,w6] = s <= s - r = [w7,w6] - [w5,w4]; if (s < 0) s <= s + m */
+ bn.sub w6, w6, w4
+ bn.subb w7, w7, w5
+ bn.add w18, w6, w12
+ bn.addc w19, w7, w13
+ bn.sel w6, w18, w6, C
+ bn.sel w7, w19, w7, C
+
+ /* [w11,w10] = v <= v - u = [w11,w10] - [w9,w8] */
+ bn.sub w10, w10, w8
+ bn.subb w11, w11, w9
+
+ /* if v > 0 go back to start of loop */
+ bn.cmp w31, w10
+ bn.cmpb w31, w11
+ csrrs x2, 1984, x0
+ andi x2, x2, 1
+ bne x2, x0, ebgcd_loop
+
+ /* v <= 0: */
+ /* if (r > m): [w17,w16] = c <= r - m = [w5,w4] - [w13,w12]
+ else: [w17,w16] = c <= r = [w5,w4] */
+ bn.sub w18, w4, w12
+ bn.subb w19, w5, w13
+ bn.cmp w12, w4
+ bn.cmpb w13, w5
+ bn.sel w16, w18, w4, C
+ bn.sel w17, w19, w5, C
+
+ ret
+
+
+/**
+ * Store curve point in projective coordinates (non randomized)
+ *
+ * Reads an affine P-384 point from dmem, addressed by two independent pointers
+ * for the affine x- and y-coordinate respectively and stores the same point in
+ * projective form at another dmem location. The destination address is given
+ * by a single pointer. All 3 coordinates (x,y,z) are consecutively stored in
+ * this order in little endian format, 256 bit aligned.
+ *
+ * This routine does not randomize the point, hence the z-coordinate is simply
+ * set to 1.
+ *
+ * @param[in] x10: dptr_x_a, pointer to affine x-coordinate of curve point
+ * @param[in] x11: dptr_y_a, pointer to affine y-coordinate of curve point
+ * @param[in] x12: dptr_proj, pointer to destination address
+ * @param[in] w31: all-zero
+ * @param[out] x12: next dmem address after stored point (256-bit aligned)
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * clobbered registers: x2, x12, w6 to w11
+ * clobbered flag groups: FG0
+ */
+store_aff_proj:
+
+ /* load point: [w7,w6] <= x, [w9,w8] <= y, [w11,w10] <= z = 1 */
+ li x2, 6
+ bn.lid x2++, 0(x10)
+ bn.lid x2++, 32(x10)
+ bn.lid x2++, 0(x11)
+ bn.lid x2++, 32(x11)
+ bn.addi w10, w31, 1
+ bn.xor w11, w11, w11
+
+ /* store point: write w6..w11 to consecutive dmem words at x12 */
+ li x2, 6
+ loopi 6, 2
+ bn.sid x2, 0(x12++)
+ addi x2, x2, 1
+ nop
+
+ ret
+
+
+/**
+ * Store curve point given in projective coordinates (non randomized)
+ *
+ * Stores a P-384 curve point located in 6 consecutive WDRs at a dmem location
+ * given by a pointer. All 3 coordinates (x,y,z) are consecutively stored in
+ * this order in little endian format, 256 bit aligned.
+ *
+ * This routine does not randomize the point; it is stored as-is.
+ *
+ * @param[in] x12: dptr_proj, pointer to destination address
+ * @param[in] [w26,w25]: x-coordinate of curve point
+ * @param[in] [w28,w27]: y-coordinate of curve point
+ * @param[in] [w30,w29]: z-coordinate of curve point
+ * @param[out] x12: next dmem address after stored point (256-bit aligned)
+ *
+ * Flags: This routine does not set any flags.
+ *
+ * clobbered registers: x2, x12
+ * clobbered flag groups: none
+ */
+store_proj:
+ li x2, 25
+ loopi 6, 2
+ bn.sid x2, 0(x12++)
+ addi x2, x2, 1
+ nop
+ ret
+
+/**
+ * P-384 ECDSA signature verification
+ *
+ * returns the affine x-coordinate of
+ * (x1, y1) = u1*G + u2*Q
+ * with u1 = z*s^-1 mod n and u2 = r*s^-1 mod n
+ * where G is the curve's base point,
+ * z is the message
+ * r, s is the signature
+ * Q is the public key.
+ *
+ * The routine computes the x1 coordinate and places it in dmem. x1 will be
+ * reduced (mod n), however, the final comparison has to be performed on the
+ * host side. The signature is valid if x1 == r.
+ * This routine runs in variable time.
+ *
+ * @param[in] dmem[4]: dptr_rnd, pointer to dmem location where the reduced
+ * affine x1-coordinate will be stored
+ * @param[in] dmem[8]: dptr_msg, pointer to the message to be verified in dmem
+ * @param[in] dmem[12]: dptr_r, pointer to r of signature in dmem
+ * @param[in] dmem[16]: dptr_s, pointer to s of signature in dmem
+ * @param[in] dmem[20]: dptr_x, pointer to x-coordinate of public key in dmem
+ * @param[in] dmem[24]: dptr_y, pointer to y-coordinate of public key in dmem
+ *
+ * Scratchpad memory layout:
+ * The routine expects at least 896 bytes of scratchpad memory at dmem
+ * location 'scratchpad' (sp). Internally the scratchpad is used as follows:
+ * dptr_sp .. dptr_sp+191: point C, projective
+ * dptr_sp+192 .. dptr_sp+383: point G, projective
+ * dptr_sp+384 .. dptr_sp+575: point Q, projective
+ * dptr_sp+576 .. dptr_sp+767: point Q+G, projective
+ * dptr_sp+768 .. dptr_sp+831: scalar u1
+ * dptr_sp+832 .. dptr_sp+895: scalar u2
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * clobbered registers: x2 to x5, x10, x11, x12, x22 to x28, w0 to w31
+ * clobbered flag groups: FG0
+ */
+.globl p384_verify
+p384_verify:
+
+ /* init all-zero reg */
+ bn.xor w31, w31, w31
+
+ /* load domain parameter n (order of base point)
+ [w13, w12] <= n = dmem[p384_n] */
+ li x2, 12
+ la x3, p384_n
+ bn.lid x2++, 0(x3)
+ bn.lid x2++, 32(x3)
+
+ /* load s of signature from dmem
+ [w30,w29] <= s = dmem[*dptr_s] */
+ li x2, 29
+ la x3, dptr_s
+ lw x3, 0(x3)
+ bn.lid x2++, 0(x3)
+ bn.lid x2++, 32(x3)
+
+ /* goto 'fail' if [w30,w29] == [w31, w31] <=> s == 0 */
+ bn.cmp w31, w29
+ bn.cmpb w31, w30
+ csrrs x2, 1984, x0
+ andi x2, x2, 1
+ beq x2, x0, fail
+
+ /* goto 'fail' if [w30,w29] >= [w13,w12] <=> s >= n */
+ bn.cmp w29, w12
+ bn.cmpb w30, w13
+ csrrs x2, 1984, x0
+ andi x2, x2, 1
+ beq x2, x0, fail
+
+ /* Compute modular inverse of S
+ Note: This can be replaced by the 'mod_inv_n_p384' subroutine at the
+ cost of ~60k cycles if reduced code size is targeted */
+ /* [w9,w8] <= [w17,w16] <= s^-1 mod n = [w30,w29]^-1 mod [w13,w12] */
+ jal x1, mod_inv_var
+ bn.mov w8, w16
+ bn.mov w9, w17
+
+ /* load Barrett constant u_n for modulus n for scalar operations
+ [w15, w14] <= u_n = dmem[p384_u_n] */
+ li x2, 14
+ la x3, p384_u_n
+ bn.lid x2++, 0(x3)
+ bn.lid x2++, 32(x3)
+
+ /* set regfile pointers to in/out regs of Barrett routine */
+ li x22, 10
+ li x23, 11
+ li x24, 16
+ li x25, 17
+
+ /* load r of signature from dmem
+ [w11,w10] <= r = dmem[*dptr_r] */
+ li x2, 10
+ la x3, dptr_r
+ lw x3, 0(x3)
+ bn.lid x2++, 0(x3)
+ bn.lid x2++, 32(x3)
+
+ /* goto 'fail' if [w11, w10] == [w31, w31] <=> r == 0 */
+ bn.cmp w31, w10
+ bn.cmpb w31, w11
+ csrrs x2, 1984, x0
+ andi x2, x2, 1
+ beq x2, x0, fail
+
+ /* goto 'fail' if [w11,w10] >= [w13,w12] <=> r >= n */
+ bn.cmp w10, w12
+ bn.cmpb w11, w13
+ csrrs x2, 1984, x0
+ andi x2, x2, 1
+ beq x2, x0, fail
+
+ /* u2 = [w3,w2] <= [w17,w16] <= r*s^-1 mod n
+ = [w11,w10]*[w17,w16] mod [w13,w12] */
+ jal x1, barrett384_p384
+ bn.mov w2, w16
+ bn.mov w3, w17
+ /* left align */
+ bn.rshi w3, w3, w2 >> 128
+ bn.rshi w2, w2, w31 >> 128
+
+ /* load message from dmem
+ [w11,w10] <= msg = dmem[*dptr_msg] */
+ li x2, 10
+ la x3, dptr_msg
+ lw x3, 0(x3)
+ bn.lid x2++, 0(x3)
+ bn.lid x2++, 32(x3)
+
+ /* u1 = [w1,w0] <= [w17,w16] <= msg*s^-1 mod n
+ = [w11,w10]*[w9,w8] mod [w13,w12] */
+ bn.mov w16, w8
+ bn.mov w17, w9
+ jal x1, barrett384_p384
+ bn.mov w0, w16
+ bn.mov w1, w17
+ /* left align */
+ bn.rshi w1, w1, w0 >> 128
+ bn.rshi w0, w0, w31 >> 128
+
+ /* store u1 and u2 in scratchpad
+ scratchpad[768] <= u1; scratchpad[832] <= u2 */
+ li x2, 0
+ la x26, scratchpad
+ bn.sid x2++, 768(x26)
+ bn.sid x2++, 800(x26)
+ bn.sid x2++, 832(x26)
+ bn.sid x2++, 864(x26)
+
+ /* load domain parameter p (modulus)
+ [w13, w12] <= p = dmem[p384_p] */
+ li x2, 12
+ la x3, p384_p
+ bn.lid x2++, 0(x3)
+ bn.lid x2++, 32(x3)
+
+ /* load Barrett constant u_p for modulus p
+ [w15, w14] = u_p = dmem[p384_u_p] */
+ li x2, 14
+ la x3, p384_u_p
+ bn.lid x2++, 0(x3)
+ bn.lid x2++, 32(x3)
+
+ /* set dmem pointer to domain parameter b */
+ la x28, p384_b
+
+ /* init double and add algorithm with C = (0, 1, 0)
+ C = (x,y,z) = scratchpad[0] <= (0, 1, 0) */
+ bn.xor w25, w25, w25
+ bn.xor w26, w26, w26
+ bn.addi w27, w31, 1
+ bn.xor w28, w28, w28
+ bn.xor w29, w29, w29
+ bn.xor w30, w30, w30
+ la x12, scratchpad
+ jal x1, store_proj
+
+ /* load base point G and use in projective form (set z to 1)
+ G = (x,y,z) = scratchpad[192] <= (dmem[p384_gx], dmem[p384_gy], 1) */
+ la x10, p384_gx
+ la x11, p384_gy
+ jal x1, store_aff_proj
+
+ /* load public key Q from dmem and use in projective form (set z to 1)
+ Q = (x,y,z) = scratchpad[384] <= (dmem[*dptr_x], dmem[*dptr_y], 1) */
+ la x3, dptr_x
+ lw x10, 0(x3)
+ la x3, dptr_y
+ lw x11, 0(x3)
+ jal x1, store_aff_proj
+
+ /* The remaining part of the routine implements a variable time
+ double-and-add algorithm. For the signature verification we need to
+ compute the point C = (x1, y1) = u1*G + u2*Q. This can be done in a
+ single double-and-add routine by using Shamir's Trick. */
+
+ /* Compute G+Q and store in dmem
+ GQ = (x,y,z) = dmem[dptr_sp+576]
+ <= dmem[dptr_sp+192] (+) dmem[dptr_sp+384] */
+ la x26, scratchpad
+ addi x27, x26, 384
+ addi x26, x26, 192
+ jal x1, proj_add_p384
+ jal x1, store_proj
+
+ la x26, scratchpad
+
+ /* main loop with decreasing index i (i=383 downto 0) */
+ loopi 384, 35
+
+ /* probe MSBs of u1 and u2 and u1|u2 to determine which point has to be
+ added. */
+
+ /* load u1 and u2 from scratchpad
+ [w1,w0] <= u1; [w3, w2] = u2 */
+ li x2, 0
+ bn.lid x2++, 768(x26)
+ bn.lid x2++, 800(x26)
+ bn.lid x2++, 832(x26)
+ bn.lid x2++, 864(x26)
+
+ /* left shift u1 = [w1,w0] <= [w1,w0] << 1 */
+ bn.add w0, w0, w0
+ bn.addc w1, w1, w1
+
+ /* keep MSB/carry bit in x3: x3 <= u1[i] */
+ csrrs x3, 1984, x0
+ andi x3, x3, 1
+
+ /* left shift u2 = [w3,w2] <= [w3,w2] << 1 */
+ bn.add w2, w2, w2
+ bn.addc w3, w3, w3
+
+ /* keep MSB/carry bit in x4: x4 <= u2[i] */
+ csrrs x4, 1984, x0
+ andi x4, x4, 1
+ li x2, 0
+
+ /* write back u1 and u2 to scratchpad */
+ bn.sid x2++, 768(x26)
+ bn.sid x2++, 800(x26)
+ bn.sid x2++, 832(x26)
+ bn.sid x2++, 864(x26)
+
+ /* test if at least one MSb of the scalars is 1
+ x5 <= x4 | x3 = u1[i] | u2[i] */
+ or x5, x4, x3
+
+ /* always double, let both input pointers for point addition point to C */
+ add x27, x26, x0
+
+ /* no addition if x5 = u1[i] | u2[i] == 0 */
+ beq x5, x0, ver_end_loop
+
+ /* perform point doubling C <= 2 (*) C */
+ jal x1, proj_add_p384
+ addi x12, x26, 0
+ jal x1, store_proj
+
+ /* check if u1[i] is set */
+ bne x3, x0, u1_set
+
+ /* only u2[i] is set: do C <= C + Q */
+ addi x27, x26, 384
+ jal x0, ver_end_loop
+
+ u1_set:
+ /* check if u2[i] is set as well */
+ bne x4, x0, both
+
+ /* only u1[i] is set: do C <= C + G */
+ addi x27, x26, 192
+ jal x0, ver_end_loop
+
+ /* both bits at current index (u1[i] and u2[i]) are set:
+ do: C <= C + (G + Q) */
+ both:
+ addi x27, x26, 576
+
+ ver_end_loop:
+ /* perform addition of selected point here, or point doubling in case
+ of no addition */
+ jal x1, proj_add_p384
+ addi x12, x26, 0
+ jal x1, store_proj
+ nop
+
+ /* compute inverse of z-coordinate: [w17,w16] <= z_c^-1 mod p */
+ jal x1, mod_inv_var
+
+ /* convert x-coordinate of C back to affine: x1 = x_c * z_c^-1 mod p */
+ bn.mov w10, w25
+ bn.mov w11, w26
+ jal x1, barrett384_p384
+
+ /* load domain parameter n (order of base point)
+ [w13, w12] <= n = dmem[p384_n] */
+ li x2, 12
+ la x3, p384_n
+ bn.lid x2++, 0(x3)
+ bn.lid x2++, 32(x3)
+
+ /* final reduction: [w5,w4] = x1 <= x1 mod n = [w17,w16] mod [w13,w12] */
+ bn.sub w4, w16, w12
+ bn.subb w5, w17, w13
+ bn.sel w4, w16, w4, C
+ bn.sel w5, w17, w5, C
+
+ fail:
+ /* dmem[*dptr_rnd] <= [w5,w4], which is x1 when reached by fall-through.
+ NOTE(review): on the branches to 'fail' [w5,w4] was not set to a result. */
+ li x2, 4
+ la x3, dptr_rnd
+ lw x3, 0(x3)
+ bn.sid x2++, 0(x3)
+ bn.sid x2++, 32(x3)
+
+ ret
+
+
+/* pointers and scratchpad memory */
+.section .data
+
+/* pointer to k (dptr_k) */
+.globl dptr_k
+dptr_k:
+ .zero 4
+
+/* pointer to rnd (dptr_rnd)
+ used for result here */
+.globl dptr_rnd
+dptr_rnd:
+ .zero 4
+
+/* pointer to msg (dptr_msg) */
+.globl dptr_msg
+dptr_msg:
+ .zero 4
+
+/* pointer to R (dptr_r) */
+.globl dptr_r
+dptr_r:
+ .zero 4
+
+/* pointer to S (dptr_s) */
+.globl dptr_s
+dptr_s:
+ .zero 4
+
+/* pointer to X (dptr_x) */
+.globl dptr_x
+dptr_x:
+ .zero 4
+
+/* pointer to Y (dptr_y) */
+.globl dptr_y
+dptr_y:
+ .zero 4
+
+/* pointer to D (dptr_d) */
+.globl dptr_d
+dptr_d:
+ .zero 4
+
+/* 896 bytes of scratchpad memory */
+scratchpad:
+ .zero 896
diff --git a/sw/otbn/code-snippets/rules.mk b/sw/otbn/code-snippets/rules.mk
index 646461f..4bd22fb 100644
--- a/sw/otbn/code-snippets/rules.mk
+++ b/sw/otbn/code-snippets/rules.mk
@@ -155,7 +155,7 @@
$(otbn-code-snippets-bin-dir)/p384_proj_add_test.elf: \
otbn-libs += $(otbn-code-snippets-obj-dir)/p384_base.o
-# code in p384_sign depends on code defined in p384_base.s
+# code in p384_sign.s depends on code defined in p384_base.s
$(otbn-code-snippets-bin-dir)/p384_sign.elf: \
$(otbn-code-snippets-obj-dir)/p384_base.o
$(otbn-code-snippets-bin-dir)/p384_sign.elf: \
@@ -163,14 +163,30 @@
# p384_scalar_mult_test depends on scalar_mult_p384 defined in p384_sign.s
$(otbn-code-snippets-bin-dir)/p384_scalar_mult_test.elf: \
- $(otbn-code-snippets-obj-dir)/p384_sign.o
+ $(otbn-code-snippets-obj-dir)/p384_sign.o \
+ $(otbn-code-snippets-obj-dir)/p384_base.o
$(otbn-code-snippets-bin-dir)/p384_scalar_mult_test.elf: \
otbn-libs += $(otbn-code-snippets-obj-dir)/p384_sign.o \
- $(otbn-code-snippets-obj-dir)/p384_base.o
+ $(otbn-code-snippets-obj-dir)/p384_base.o
# p384_ecdsa_sign_test depends on p384_sign defined in p384_sign.s
$(otbn-code-snippets-bin-dir)/p384_ecdsa_sign_test.elf: \
- $(otbn-code-snippets-obj-dir)/p384_sign.o
+ $(otbn-code-snippets-obj-dir)/p384_sign.o \
+ $(otbn-code-snippets-obj-dir)/p384_base.o
$(otbn-code-snippets-bin-dir)/p384_ecdsa_sign_test.elf: \
otbn-libs += $(otbn-code-snippets-obj-dir)/p384_sign.o \
+ $(otbn-code-snippets-obj-dir)/p384_base.o
+
+# code in p384_verify.s depends on code defined in p384_base.s
+$(otbn-code-snippets-bin-dir)/p384_verify.elf: \
$(otbn-code-snippets-obj-dir)/p384_base.o
+$(otbn-code-snippets-bin-dir)/p384_verify.elf: \
+ otbn-libs += $(otbn-code-snippets-obj-dir)/p384_base.o
+
+# p384_ecdsa_verify_test depends on p384_verify defined in p384_verify.s
+$(otbn-code-snippets-bin-dir)/p384_ecdsa_verify_test.elf: \
+ $(otbn-code-snippets-obj-dir)/p384_verify.o \
+ $(otbn-code-snippets-obj-dir)/p384_base.o
+$(otbn-code-snippets-bin-dir)/p384_ecdsa_verify_test.elf: \
+ otbn-libs += $(otbn-code-snippets-obj-dir)/p384_verify.o \
+ $(otbn-code-snippets-obj-dir)/p384_base.o