| /* Copyright lowRISC contributors. */ |
| /* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ |
| /* SPDX-License-Identifier: Apache-2.0 */ |
| /* |
| * P-384 specific routines for ECDSA signature verification and curve point |
| * test. |
| */ |
| |
| .section .text |
| |
| /** |
| * Checks if a point is a valid curve point on curve P-384 |
| * |
| * Returns r = x^3 + ax + b mod p |
| * and s = y^2 mod p |
| * where x,y are the affine coordinates of the curve point and |
| * a, b and p being the domain parameters of curve P-384. |
| * |
| * This routine checks if a point with given x- and y-coordinate is a valid |
| * curve point on P-384. |
| * The routine checks whether the coordinates are a solution of the |
| * Weierstrass equation y^2 = x^3 + ax + b mod p. |
| * The routine makes use of the property that the domain parameter 'a' can be |
| * written as a=-3 for the P-384 curve, hence the routine is limited to P-384. |
| * The routine does not return a boolean result but computes the left side |
| * and the right sight of the Weierstrass equation and leaves the final |
| * comparison to the caller. |
| * The routine runs in constant time. |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * @param[in] dmem[12]: dptr_r, pointer to dmem location where right |
| * side result r will be stored |
| * @param[in] dmem[16]: dptr_s, pointer to dmem location where left side |
| * result s will be stored |
| * @param[in] dmem[20]: dptr_x, pointer to dmem location containing affine |
| * x-coordinate of input point |
| * @param[in] dmem[24]: dptr_y, pointer to dmem location containing affine |
| * y-coordinate of input point |
| * |
| * clobbered registers: x2, x3, w0 to w5, w10 to w17 |
| * clobbered flag groups: FG0 |
| */ |
| .globl p384_isoncurve |
| p384_isoncurve: |
| |
| /* setup all-zero reg */ |
| bn.xor w31, w31, w31 |
| |
| /* load affine x-coordinate of curve point from dmem |
| [w1, w0] <= dmem[dptr_x] = dmem[20] */ |
| la x3, dptr_x |
| lw x3, 0(x3) |
| li x2, 0 |
| bn.lid x2++, 0(x3) |
| bn.lid x2++, 32(x3) |
| |
| /* load affine y-coordinate of curve point from dmem |
| [w3, w2] <= dmem[dptr_y] = dmem[24] */ |
| la x3, dptr_y |
| lw x3, 0(x3) |
| bn.lid x2++, 0(x3) |
| bn.lid x2, 32(x3) |
| |
| /* load domain parameter p (modulus) from dmem |
| [w13, w12] = p = dmem[p384_p] */ |
| li x2, 12 |
| la x3, p384_p |
| bn.lid x2++, 0(x3) |
| bn.lid x2++, 32(x3) |
| |
| /* load domain parameter b from dmem |
| [w4, w5] = b = dmem[p384_b] */ |
| li x2, 4 |
| la x3, p384_b |
| bn.lid x2++, 0(x3) |
| bn.lid x2++, 32(x3) |
| |
| /* y^2 = [w17,w16] <= y*y = [w3,w2]*w[w3,w2] */ |
| bn.mov w10, w2 |
| bn.mov w11, w3 |
| bn.mov w16, w2 |
| bn.mov w17, w3 |
| jal x1, p384_mulmod_p |
| |
| /* store result (left side): dmem[dptr_s] <= y^2 = [w17,w16] */ |
| la x3, dptr_s |
| lw x3, 0(x3) |
| li x2, 16 |
| bn.sid x2++, 0(x3) |
| bn.sid x2++, 32(x3) |
| |
| /* x^3 = [w17,w16] <= (x*x)*x = ([w1,w0]*(w1,w0])*[w1,w0] */ |
| bn.mov w10, w0 |
| bn.mov w11, w1 |
| bn.mov w16, w0 |
| bn.mov w17, w1 |
| jal x1, p384_mulmod_p |
| bn.mov w10, w0 |
| bn.mov w11, w1 |
| jal x1, p384_mulmod_p |
| |
| /* for curve P-384, 'a' can be written as a = -3, therefore we subtract |
| x three times from x^3. |
| x^3 + ax mod p = [w17,w16] <= x^3 -3 x mod p |
| = [w17,w16] - [w1,w0] - [w1,w0] - [w1,w0] mod [w13,w12] */ |
| loopi 3, 6 |
| bn.sub w16, w16, w0 |
| bn.subb w17, w17, w1 |
| bn.add w10, w16, w12 |
| bn.addc w11, w17, w13 |
| bn.sel w16, w10, w16, C |
| bn.sel w17, w11, w17, C |
| |
| /* add domain parameter b |
| x^3 + ax + b mod p = [w17,w16] <= [w17,w16] + [w5,w4] mod [w13,w12] */ |
| bn.add w16, w16, w4 |
| bn.addc w17, w17, w5 |
| bn.sub w10, w16, w12 |
| bn.subb w11, w17, w13 |
| bn.sel w16, w16, w10, C |
| bn.sel w17, w17, w11, C |
| |
| /* store result (right side) |
| dmem[dptr_r] <= x^3 + ax + b mod p = [w17,w16] */ |
| la x3, dptr_r |
| lw x3, 0(x3) |
| li x2, 16 |
| bn.sid x2++, 0(x3) |
| bn.sid x2++, 32(x3) |
| |
| ret |
| |
| |
| /** |
| * 384-bit variable time modular multiplicative inverse computation |
| * |
| * Returns c <= a^(-1) mod m |
| * where 'a' is a bigint of length 384 bit with a < m |
| * 'm' is the modulus with a length of 384 bit |
| * 'c' is a 384-bit result |
| * |
| * This routine implements the computation of the modular multiplicative |
| * inverse based on the binary GCD or Stein's algorithm. |
| * The implemented variant is based on the "right-shift binary extended GCD" |
| * as it is described in section 3.1 of [1] (Algorithm 1). |
| * [1] https://doi.org/10.1155/ES/2006/32192 |
| * |
| * Note that this is a variable time implementation. I.e. this routine will |
| * show a data-dependent timing and execution profile. Only use where a |
| * full white-box scenario is acceptable. |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * @param[in] [w30, w29]: a, 384-bit operand |
| * @param[in] [w13, w12]: m, modulus |
| * @param[in] w31: all-zero |
| * @param[out] [w17,w16]: result c |
| * |
| * clobbered registers: x2, w2, w4 to w11, w16 to w19 |
| * clobbered flag groups: FG0 |
| */ |
| mod_inv_var: |
| /* [w5,w4] = r <= 0 */ |
| bn.xor w4, w4, w4 |
| bn.xor w5, w5, w5 |
| |
| /* [w7,w6] = s <= 1 */ |
| bn.addi w6, w31, 1 |
| bn.xor w7, w7, w7 |
| |
| /* [w9,w8] = u <= m = [w13, w12]*/ |
| bn.mov w8, w12 |
| bn.mov w9, w13 |
| |
| /* [w11,w10] = v <= [w30, w29] */ |
| bn.mov w10, w29 |
| bn.mov w11, w30 |
| |
| ebgcd_loop: |
| /* test if u is odd */ |
| bn.or w8, w8, w8 |
| csrrs x2, 0x7c0, x0 |
| andi x2, x2, 4 |
| bne x2, x0, ebgcd_u_odd |
| |
| /* u is even: */ |
| /* [w9,w8] = u <= u/2 = [w9,w8] >> 1 */ |
| bn.rshi w8, w9, w8 >> 1 |
| bn.rshi w9, w31, w9 >> 1 |
| |
| /* test if r is odd */ |
| bn.or w4, w4, w4 |
| csrrs x2, 0x7c0, x0 |
| andi x2, x2, 4 |
| bne x2, x0, ebgcd_r_odd |
| |
| /* r is even: */ |
| /* [w5,w4] = r <= r/2 = [w5,w4] >> 1 */ |
| bn.rshi w4, w5, w4 >> 1 |
| bn.rshi w5, w31, w5 >> 1 |
| jal x0, ebgcd_loop |
| |
| ebgcd_r_odd: |
| /* [w5,w4] = r <= (r + m)/2 = ([w5,w4] + [w13,w12]) >> 1 */ |
| bn.add w4, w4, w12 |
| bn.addc w5, w5, w13 |
| bn.rshi w4, w5, w4 >> 1 |
| bn.rshi w5, w31, w5 >> 1 |
| jal x0, ebgcd_loop |
| |
| ebgcd_u_odd: |
| /* test if v is odd */ |
| bn.or w10, w10, w10 |
| csrrs x2, 0x7c0, x0 |
| andi x2, x2, 4 |
| bne x2, x0, ebgcd_uv_odd |
| |
| /* v is even: */ |
| /* [w11,w10] = v <= v/2 = [w11,w10] >> 1 */ |
| bn.rshi w10, w11, w10 >> 1 |
| bn.rshi w11, w31, w11 >> 1 |
| |
| /* test if s is odd */ |
| bn.or w6, w6, w6 |
| csrrs x2, 0x7c0, x0 |
| andi x2, x2, 4 |
| bne x2, x0, ebgcd_s_odd |
| |
| /* s is even: */ |
| /* [w7,w6] = s <= s/2 = [w7,w6] >> 1 */ |
| bn.rshi w6, w7, w6 >> 1 |
| bn.rshi w7, w31, w7 >> 1 |
| jal x0, ebgcd_loop |
| |
| ebgcd_s_odd: |
| /* [w7,w6] = s <= (s + m)/2 = ([w7,w6] + [w13,w12]) >> 1 */ |
| bn.add w6, w6, w12 |
| bn.addc w7, w7, w13 |
| bn.rshi w6, w7, w6 >> 1 |
| bn.rshi w7, w31, w7 >> 1 |
| jal x0, ebgcd_loop |
| |
| ebgcd_uv_odd: |
| /* test if v >= u */ |
| bn.cmp w10, w8 |
| bn.cmpb w11, w9 |
| csrrs x2, 0x7c0, x0 |
| andi x2, x2, 1 |
| beq x2, x0, ebgcd_v_gte_u |
| |
| /* u > v: */ |
| /* [w5,w4] = r <= r - s = [w5,w4] - [w7,w6]; if (r < 0): r <= r + m */ |
| bn.sub w4, w4, w6 |
| bn.subb w5, w5, w7 |
| bn.add w18, w4, w12 |
| bn.addc w19, w5, w13 |
| bn.sel w4, w18, w4, C |
| bn.sel w5, w19, w5, C |
| |
| /* [w9,w8] = u <= u - v = [w9,w8] - [w11,w10] */ |
| bn.sub w8, w8, w10 |
| bn.subb w9, w9, w11 |
| jal x0, ebgcd_loop |
| |
| ebgcd_v_gte_u: |
| /* [w7,w6] = s <= s - r = [w7,w6] - [w5,w4]; if (s < 0) s <= s + m */ |
| bn.sub w6, w6, w4 |
| bn.subb w7, w7, w5 |
| bn.add w18, w6, w12 |
| bn.addc w19, w7, w13 |
| bn.sel w6, w18, w6, C |
| bn.sel w7, w19, w7, C |
| |
| /* [w11,w10] = v <= v - u = [w11,w10] - [w9,w8] */ |
| bn.sub w10, w10, w8 |
| bn.subb w11, w11, w9 |
| |
| /* if v > 0 go back to start of loop */ |
| bn.cmp w31, w10 |
| bn.cmpb w31, w11 |
| csrrs x2, 0x7c0, x0 |
| andi x2, x2, 1 |
| bne x2, x0, ebgcd_loop |
| |
| /* v <= 0: */ |
| /* if (r > m): [w17,w16] = a <= r - m = [w5,w4] - [w13,w12] |
| else: [w17,w16] = a <= r = [w5,w4] */ |
| bn.sub w18, w4, w12 |
| bn.subb w19, w5, w13 |
| bn.cmp w12, w4 |
| bn.cmpb w13, w5 |
| bn.sel w16, w18, w4, C |
| bn.sel w17, w19, w5, C |
| |
| ret |
| |
| |
| /** |
| * Store curve point in projective coordinates (non randomized) |
| * |
| * Reads an affine P-384 from dmem, addressed by two independent pointers for |
| * the affine x- and y-coordinate respectively and stores the same point in |
| * projective form at another dmem location. The destination address is given |
| * by a single pointer. All 3 coordinates (x,y,z) are consecutively stored in |
| * this order in little endian format, 256 bit aligned. |
| * |
| * This routine does not randomize the point, hence the z-cooridnate is simply |
| * set to 1. |
| * |
| * @param[in] x10: dptr_x_a, pointer to affine x-coordinate of curve point |
| * @param[in] x11: dptr_y_a, pointer to affine y-coordinate of curve point |
| * @param[in] x12: dptr_proj, pointer to destination address |
| * @param[in] w31: all-zero |
| * @param[out] x12: next dmem address after stored point (256-bit aligned) |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * clobbered registers: x2, x12, w6 to w11 |
| * clobbered flag groups: FG0 |
| */ |
| store_aff_proj: |
| |
| /* load point */ |
| li x2, 6 |
| bn.lid x2++, 0(x10) |
| bn.lid x2++, 32(x10) |
| bn.lid x2++, 0(x11) |
| bn.lid x2++, 32(x11) |
| bn.addi w10, w31, 1 |
| bn.xor w11, w11, w11 |
| |
| /* store point */ |
| li x2, 6 |
| loopi 6, 2 |
| bn.sid x2, 0(x12++) |
| addi x2, x2, 1 |
| nop |
| |
| ret |
| |
| |
| /** |
| * Store curve point in projective coordinates (non randomized) |
| * |
| * Stores a P-384 curve point located in 6 consecutive WDRs at a dmem location |
| * given by a pointer. All 3 coordinates (x,y,z) are consecutively stored in |
| * this order in little endian format, 256 bit aligned. |
| * |
| * This routine does not randomize the point. |
| * |
| * @param[in] x12: dptr_proj, pointer to destination address |
| * @param[in] [w26,w25]: x-coordinate of curve point |
| * @param[in] [w28,w27]: y-coordinate of curve point |
| * @param[in] [w30,w29]: z-coordinate of curve point |
| * @param[out] x12: next dmem address after stored point (256-bit aligned) |
| * |
| * Flags: This routine doe not set any flags. |
| * |
| * clobbered registers: x2, x12 |
| * clobbered flag groups: none |
| */ |
| store_proj: |
| li x2, 25 |
| loopi 6, 2 |
| bn.sid x2, 0(x12++) |
| addi x2, x2, 1 |
| nop |
| ret |
| |
| /** |
| * P-384 ECDSA signature verification |
| * |
| * returns the affine x-coordinate of |
| * (x1, y1) = u1*G + u2*Q |
| * with u1 = z*s^-1 mod n and u2 = r*s^-1 mod n |
| * where G is the curve's base point, |
| * z is the message |
| * r, s is the signature |
| * Q is the public key. |
| * |
| * The routine computes the x1 coordinate and places it in dmem. x1 will be |
| * reduced (mod n), however, the final comparison has to be performed on the |
| * host side. The signature is valid if x1 == r. |
| * This routine runs in variable time. |
| * |
| * @param[in] dmem[4]: dptr_rnd, pointer to dmem location where the reduced |
| * affine x1-coordinate will be stored |
| * @param[in] dmem[8]: dptr_msg, pointer to the message to be verified in dmem |
| * @param[in] dmem[12]: dptr_r, pointer to r of signature in dmem |
| * @param[in] dmem[16]: dptr_s, pointer to s of signature in dmem |
| * @param[in] dmem[20]: dptr_x, pointer to x-coordinate of public key in dmem |
| * @param[in] dmem[20]: dptr_y, pointer to y-coordinate of public key in dmem |
| * |
| * Scratchpad memory layout: |
| * The routine expects at least 896 bytes of scratchpad memory at dmem |
| * location 'scratchpad' (sp). Internally the scratchpad is used as follows: |
| * dptr_sp .. dptr_sp+191: point C, projective |
| * dptr_sp+192 .. dptr_sp+383: point G, projective |
| * dptr_sp+384 .. dptr_sp+575: point Q, projective |
| * dptr_sp+576 .. dptr_sp+767: point Q+G, projective |
| * dptr_sp+768 .. dptr_sp+831: scalar u1 |
| * dptr_sp+832 .. dptr_sp+896: scalar u2 |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * clobbered registers: x2 to x5, x10, x11, x12, x22 to 28, w0 to w31 |
| * clobbered flag groups: FG0 |
| */ |
| .globl p384_verify |
| p384_verify: |
| |
| /* init all-zero reg */ |
| bn.xor w31, w31, w31 |
| |
| /* load domain parameter n (order of base point) |
| [w13, w12] <= n = dmem[p384_n] */ |
| li x2, 12 |
| la x3, p384_n |
| bn.lid x2++, 0(x3) |
| bn.lid x2++, 32(x3) |
| |
| /* load s of signature from dmem |
| [w30,w29] <= s = dmem[*dptr_s] */ |
| li x2, 29 |
| la x3, dptr_s |
| lw x3, 0(x3) |
| bn.lid x2++, 0(x3) |
| bn.lid x2++, 32(x3) |
| |
| /* goto 'fail' if [w30,w29] == [w31, w31] <=> s == 0 */ |
| bn.cmp w31, w29 |
| bn.cmpb w31, w30 |
| csrrs x2, 0x7c0, x0 |
| andi x2, x2, 1 |
| beq x2, x0, fail |
| |
| /* goto 'fail' if [w30,w29] >= [w12,w13] <=> s >= n */ |
| bn.cmp w29, w12 |
| bn.cmpb w30, w13 |
| csrrs x2, 0x7c0, x0 |
| andi x2, x2, 1 |
| beq x2, x0, fail |
| |
| /* Compute modular inverse of S |
| Note: This can be replaced by the 'mod_inv_n_p384' subroutine at the |
| cost of ~60k cycles if reduced code size is targeted */ |
| /* [w9,w8] <= [w17,w16] <= s^-1 mod n = [w30,w29]^-1 mod [w13,w12] */ |
| jal x1, mod_inv_var |
| bn.mov w8, w16 |
| bn.mov w9, w17 |
| |
| /* Compute Solinas constant k for modulus n (we know it is only 191 bits, so |
| no need to compute the high part): |
| w14 <= 2^256 - n[255:0] = (2^384 - n) mod (2^256) = 2^384 - n */ |
| bn.sub w14, w31, w12 |
| |
| /* set regfile pointers to in/out regs of Barrett routine */ |
| li x22, 10 |
| li x23, 11 |
| li x24, 16 |
| li x25, 17 |
| |
| /* load r of signature from dmem |
| [w11,w10] <= r = dmem[*dptr_r] */ |
| li x2, 10 |
| la x3, dptr_r |
| lw x3, 0(x3) |
| bn.lid x2++, 0(x3) |
| bn.lid x2++, 32(x3) |
| |
| /* goto 'fail' if [w11, w10] == [w31, w31] <=> r == 0 */ |
| bn.cmp w31, w10 |
| bn.cmpb w31, w11 |
| csrrs x2, 0x7c0, x0 |
| andi x2, x2, 1 |
| beq x2, x0, fail |
| |
| /* goto 'fail' if [w11,w10] >= [w12,w13] <=> r >= n */ |
| bn.cmp w10, w12 |
| bn.cmpb w11, w13 |
| csrrs x2, 0x7c0, x0 |
| andi x2, x2, 1 |
| beq x2, x0, fail |
| |
| /* u2 = [w3,w2] <= [w17,w16] <= r*s^-1 mod n |
| = [w11,w10]*[w17,w16] mod [w13,w12] */ |
| jal x1, p384_mulmod_n |
| bn.mov w2, w16 |
| bn.mov w3, w17 |
| /* left align */ |
| bn.rshi w3, w3, w2 >> 128 |
| bn.rshi w2, w2, w31 >> 128 |
| |
| /* load message from dmem |
| [w11,w10] <= msg = dmem[*dptr_msg] */ |
| li x2, 10 |
| la x3, dptr_msg |
| lw x3, 0(x3) |
| bn.lid x2++, 0(x3) |
| bn.lid x2++, 32(x3) |
| |
| /* u1 = [w1,w0] <= [w17,w16] <= msg*s^-1 mod n |
| = [w11,w10]*[w9,w8] mod [w13,w12] */ |
| bn.mov w16, w8 |
| bn.mov w17, w9 |
| jal x1, p384_mulmod_n |
| bn.mov w0, w16 |
| bn.mov w1, w17 |
| /* left align */ |
| bn.rshi w1, w1, w0 >> 128 |
| bn.rshi w0, w0, w31 >> 128 |
| |
| /* store u1 and u2 in scratchpad |
| scratchpad[768] <= u1; scratchpad[832] <= u2 */ |
| li x2, 0 |
| la x26, scratchpad |
| bn.sid x2++, 768(x26) |
| bn.sid x2++, 800(x26) |
| bn.sid x2++, 832(x26) |
| bn.sid x2++, 864(x26) |
| |
| /* load domain parameter p (modulus) |
| [w13, w12] <= p = dmem[p384_p] */ |
| li x2, 12 |
| la x3, p384_p |
| bn.lid x2++, 0(x3) |
| bn.lid x2++, 32(x3) |
| |
| /* set dmem pointer to domain parameter b */ |
| la x28, p384_b |
| |
| /* init double and add algorithm with C = (0, 1, 0) |
| GQ = (x,y,z) = scratchpad[0] <= (0, 1, 0) */ |
| bn.xor w25, w25, w25 |
| bn.xor w26, w26, w26 |
| bn.addi w27, w31, 1 |
| bn.xor w28, w28, w28 |
| bn.xor w29, w29, w29 |
| bn.xor w30, w30, w30 |
| la x12, scratchpad |
| jal x1, store_proj |
| |
| /* load base point G and use in projective form (set z to 1) |
| G = (x,y,z) = scratchpad[192] <= (dmem[p384_gy], dmem[p384_gy], 1) */ |
| la x10, p384_gx |
| la x11, p384_gy |
| jal x1, store_aff_proj |
| |
| /* load public key Q from dmem and use in projective form (set z to 1) |
| Q = (x,y,z) = scratchpad[384] <= (dmem[*dptr_x], dmem[*dptr_y], 1) */ |
| la x3, dptr_x |
| lw x10, 0(x3) |
| la x3, dptr_y |
| lw x11, 0(x3) |
| jal x1, store_aff_proj |
| |
| /* The remaining part of the routine implements a variable time |
| double-and-add algorithm. For the signature verification we need to |
| compute the point C = (x1, y1) = u1*G + _2*Q. This can be done in a |
| single double-and-add routine by using Shamir's Trick. */ |
| |
| /* Compute G+Q and store in dmem |
| GQ = (x,y,z) = dmem[dptr_sp+576] |
| <= sp[dptr_sp+192] (+) dmem[dptr_sp+384] */ |
| la x26, scratchpad |
| addi x27, x26, 384 |
| addi x26, x26, 192 |
| jal x1, proj_add_p384 |
| jal x1, store_proj |
| |
| la x26, scratchpad |
| |
| /* main loop with decreasing index i (i=383 downto 0) */ |
| loopi 384, 35 |
| |
| /* probe MSBs of u1 and u2 and u1|u2 to determine which point has to be |
| added. */ |
| |
| /* load u1 and u2 from scratchpad |
| [w1,w0] <= u1; [w3, w2] = u2 */ |
| li x2, 0 |
| bn.lid x2++, 768(x26) |
| bn.lid x2++, 800(x26) |
| bn.lid x2++, 832(x26) |
| bn.lid x2++, 864(x26) |
| |
| /* left shift u1 = [w1,w0] <= [w1,w0] << 1 */ |
| bn.add w0, w0, w0 |
| bn.addc w1, w1, w1 |
| |
| /* keep MSB/carry bit in x3: x3 <= u1[i] */ |
| csrrs x3, 0x7c0, x0 |
| andi x3, x3, 1 |
| |
| /* left shift u2 = [w3,w2] <= [w3,w2] << 1 */ |
| bn.add w2, w2, w2 |
| bn.addc w3, w3, w3 |
| |
| /* keep MSB/carry bit in x3: x4 <= u2[i] */ |
| csrrs x4, 0x7c0, x0 |
| andi x4, x4, 1 |
| li x2, 0 |
| |
| /* write back u1 and u2 to scratchpad */ |
| bn.sid x2++, 768(x26) |
| bn.sid x2++, 800(x26) |
| bn.sid x2++, 832(x26) |
| bn.sid x2++, 864(x26) |
| |
| /* test if at least one MSb of the scalars is 1 |
| x5 <= x4 | x3 = u1[i] | u2[i] */ |
| or x5, x4, x3 |
| |
| /* always double, let both input pointers for point addition point to C */ |
| add x27, x26, x0 |
| |
| /* no addition if x5 = u1[i] | u2[i] == 0 */ |
| beq x5, x0, ver_end_loop |
| |
| /* perform point doubling C <= 2 (*) C */ |
| jal x1, proj_add_p384 |
| addi x12, x26, 0 |
| jal x1, store_proj |
| |
| /* check if u1[i] is set */ |
| bne x3, x0, u1_set |
| |
| /* only u2[i] is set: do C <= C + Q */ |
| addi x27, x26, 384 |
| jal x0, ver_end_loop |
| |
| u1_set: |
| /* chek if u2[i] is set as well */ |
| bne x4, x0, both |
| |
| /* only u1[i] is set: do C <= C + G */ |
| add x27, x26, 192 |
| jal x0, ver_end_loop |
| |
| /* both bits at current index (u1[i] and u2[i]) are set: |
| do: C <= C + (G + Q) */ |
| both: |
| addi x27, x26, 576 |
| |
| ver_end_loop: |
| /* perform addition of selected point here, or point doubling in case |
| of no addition */ |
| jal x1, proj_add_p384 |
| addi x12, x26, 0 |
| jal x1, store_proj |
| nop |
| |
| /* compute inverse of z-coordinate: [w1,w0] <= z_c^-1 mod p */ |
| jal x1, mod_inv_var |
| |
| /* convert x-coordinate of C back to affine: x1 = x_c * z_c^-1 mod p */ |
| bn.mov w10, w25 |
| bn.mov w11, w26 |
| jal x1, p384_mulmod_p |
| |
| /* load domain parameter n (order of base point) |
| [w13, w12] <= n = dmem[p384_n] */ |
| li x2, 12 |
| la x3, p384_n |
| bn.lid x2++, 0(x3) |
| bn.lid x2++, 32(x3) |
| |
| /* final reduction: [w5,w4] = x1 <= x1 mod n = [w17,w16] mod [w13,w12] */ |
| bn.sub w4, w16, w12 |
| bn.subb w5, w17, w13 |
| bn.sel w4, w16, w4, C |
| bn.sel w5, w17, w5, C |
| |
| fail: |
| |
| /* store affine x-coordinate in dmem: dmem[dptr_rnd] <= x1 = [w5,w4] */ |
| li x2, 4 |
| la x3, dptr_rnd |
| lw x3, 0(x3) |
| bn.sid x2++, 0(x3) |
| bn.sid x2++, 32(x3) |
| |
| ret |
| |
| |
| /* pointers and scratchpad memory */ |
| .section .data |
| |
| /* pointer to k (dptr_k) */ |
| .globl dptr_k |
| dptr_k: |
| .zero 4 |
| |
| /* pointer to rnd (dptr_rnd) |
| used for result here */ |
| .globl dptr_rnd |
| dptr_rnd: |
| .zero 4 |
| |
| /* pointer to msg (dptr_msg) */ |
| .globl dptr_msg |
| dptr_msg: |
| .zero 4 |
| |
| /* pointer to R (dptr_r) */ |
| .globl dptr_r |
| dptr_r: |
| .zero 4 |
| |
| /* pointer to S (dptr_s) */ |
| .globl dptr_s |
| dptr_s: |
| .zero 4 |
| |
| /* pointer to X (dptr_x) */ |
| .globl dptr_x |
| dptr_x: |
| .zero 4 |
| |
| /* pointer to Y (dptr_y) */ |
| .globl dptr_y |
| dptr_y: |
| .zero 4 |
| |
| /* pointer to D (dptr_d) */ |
| .globl dptr_d |
| dptr_d: |
| .zero 4 |
| |
| /* Scratchpad memory */ |
| scratchpad: |
| .zero 896 |