|  | /* Copyright lowRISC contributors. */ | 
|  | /* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ | 
|  | /* SPDX-License-Identifier: Apache-2.0 */ | 
|  | /* | 
|  | *   P-384 specific routines for ECDSA signature verification and curve point | 
|  | *   test. | 
|  | */ | 
|  |  | 
|  | .section .text | 
|  |  | 
|  | /** | 
|  | * Checks if a point is a valid curve point on curve P-384 | 
|  | * | 
|  | * Returns r = x^3 + ax + b  mod p | 
|  | *     and s = y^2  mod p | 
|  | *         where x,y are the affine coordinates of the curve point and | 
|  | *              a, b and p being the domain parameters of curve P-384. | 
|  | * | 
|  | * This routine checks if a point with given x- and y-coordinate is a valid | 
|  | * curve point on P-384. | 
|  | * The routine checks whether the coordinates are a solution of the | 
|  | * Weierstrass equation y^2 = x^3 + ax + b  mod p. | 
|  | * The routine makes use of the property that the domain parameter 'a' can be | 
|  | * written as a=-3 for the P-384 curve, hence the routine is limited to P-384. | 
|  | * The routine does not return a boolean result but computes the left side | 
|  | * and the right side of the Weierstrass equation and leaves the final | 
|  | * comparison to the caller. | 
|  | * The routine runs in constant time (the code below contains no | 
|  | * data-dependent branches; barrett384_p384 is defined outside this file | 
|  | * section and is assumed to be constant time as well - TODO confirm). | 
|  | * | 
|  | * The coordinates are passed to barrett384_p384 exactly as loaded; this | 
|  | * routine performs no range check on x or y. | 
|  | * | 
|  | * Flags: Flags have no meaning beyond the scope of this subroutine. | 
|  | * | 
|  | * @param[in]  dmem[12]: dptr_r, pointer to dmem location where right | 
|  | *                               side result r will be stored | 
|  | * @param[in]  dmem[16]: dptr_s, pointer to dmem location where left side | 
|  | *                               result s will be stored | 
|  | * @param[in]  dmem[20]: dptr_x, pointer to dmem location containing affine | 
|  | *                               x-coordinate of input point | 
|  | * @param[in]  dmem[24]: dptr_y, pointer to dmem location containing affine | 
|  | *                               y-coordinate of input point | 
|  | * | 
|  | * Each 384-bit value occupies two consecutive 256-bit dmem words (offsets 0 | 
|  | * and 32 from the pointer), low word first. | 
|  | * | 
|  | * clobbered registers: x1 (link register, written by jal), x2, x3, | 
|  | *                      w0 to w5, w10 to w17, w31 (set to zero), | 
|  | *                      plus whatever barrett384_p384 clobbers beyond these | 
|  | * clobbered flag groups: FG0 | 
|  | */ | 
|  | .globl p384_isoncurve | 
|  | p384_isoncurve: | 
|  |  | 
|  | /* setup all-zero reg */ | 
|  | bn.xor    w31, w31, w31 | 
|  |  | 
|  | /* load affine x-coordinate of curve point from dmem | 
|  | [w1, w0] <= dmem[*dptr_x]; the pointer itself is read from dptr_x (dmem[20]). | 
|  | x2 holds the destination WDR index for bn.lid and is post-incremented. */ | 
|  | la        x3, dptr_x | 
|  | lw        x3, 0(x3) | 
|  | li        x2, 0 | 
|  | bn.lid    x2++, 0(x3) | 
|  | bn.lid    x2++, 32(x3) | 
|  |  | 
|  | /* load affine y-coordinate of curve point from dmem | 
|  | [w3, w2] <= dmem[*dptr_y]; the pointer itself is read from dptr_y (dmem[24]). | 
|  | x2 continues at index 2 from the x-coordinate load above. */ | 
|  | la        x3, dptr_y | 
|  | lw        x3, 0(x3) | 
|  | bn.lid    x2++, 0(x3) | 
|  | bn.lid    x2, 32(x3) | 
|  |  | 
|  | /* load domain parameter p (modulus) from dmem | 
|  | [w13, w12] = p = dmem[p384_p] */ | 
|  | li        x2, 12 | 
|  | la        x3, p384_p | 
|  | bn.lid    x2++, 0(x3) | 
|  | bn.lid    x2++, 32(x3) | 
|  |  | 
|  | /* load Barrett constant u for modulus p from dmem | 
|  | [w15, w14] = u_p = dmem[p384_u_p] */ | 
|  | li        x2, 14 | 
|  | la        x3, p384_u_p | 
|  | bn.lid    x2++, 0(x3) | 
|  | bn.lid    x2++, 32(x3) | 
|  |  | 
|  | /* load domain parameter b from dmem | 
|  | [w5, w4] = b = dmem[p384_b] */ | 
|  | li        x2, 4 | 
|  | la        x3, p384_b | 
|  | bn.lid    x2++, 0(x3) | 
|  | bn.lid    x2++, 32(x3) | 
|  |  | 
|  | /* y^2 = [w17,w16] <= y*y = [w3,w2]*[w3,w2] | 
|  | barrett384_p384 is called with its operands in [w11,w10] and [w17,w16] and | 
|  | its result is taken from [w17,w16]. */ | 
|  | bn.mov    w10, w2 | 
|  | bn.mov    w11, w3 | 
|  | bn.mov    w16, w2 | 
|  | bn.mov    w17, w3 | 
|  | jal       x1, barrett384_p384 | 
|  |  | 
|  | /* store result (left side): dmem[*dptr_s] <= y^2 = [w17,w16] */ | 
|  | la        x3, dptr_s | 
|  | lw        x3, 0(x3) | 
|  | li        x2, 16 | 
|  | bn.sid    x2++, 0(x3) | 
|  | bn.sid    x2++, 32(x3) | 
|  |  | 
|  | /*  x^3 = [w17,w16] <= (x*x)*x = ([w1,w0]*[w1,w0])*[w1,w0] | 
|  | The first call leaves x^2 in [w17,w16]; [w11,w10] is reloaded with x before | 
|  | the second call (presumably because barrett384_p384 does not preserve | 
|  | [w11,w10] - it is defined elsewhere). */ | 
|  | bn.mov    w10, w0 | 
|  | bn.mov    w11, w1 | 
|  | bn.mov    w16, w0 | 
|  | bn.mov    w17, w1 | 
|  | jal       x1, barrett384_p384 | 
|  | bn.mov    w10, w0 | 
|  | bn.mov    w11, w1 | 
|  | jal       x1, barrett384_p384 | 
|  |  | 
|  | /* for curve P-384, 'a' can be written as a = -3, therefore we subtract | 
|  | x three times from x^3. | 
|  | x^3 + ax  mod p = [w17,w16] <= x^3 -3 x mod p | 
|  | = [w17,w16] - [w1,w0] - [w1,w0] - [w1,w0] mod [w13,w12] | 
|  | The loop body is the 6 instructions following loopi. Per iteration: | 
|  | subtract x, then unconditionally compute (difference + p) into [w11,w10]. | 
|  | The bn.sel pair tests the carry of that addition, not the borrow of the | 
|  | subtraction: the 512-bit addition only carries out if the subtraction | 
|  | wrapped below zero, so C selects the corrected value exactly then. */ | 
|  | loopi     3, 6 | 
|  | bn.sub    w16, w16, w0 | 
|  | bn.subb   w17, w17, w1 | 
|  | bn.add    w10, w16, w12 | 
|  | bn.addc   w11, w17, w13 | 
|  | bn.sel    w16, w10, w16, C | 
|  | bn.sel    w17, w11, w17, C | 
|  |  | 
|  | /* add domain parameter b | 
|  | x^3 + ax + b mod p = [w17,w16] <= [w17,w16] + [w5,w4] mod [w13,w12] | 
|  | (sum - p) is computed into [w11,w10]; a borrow (C set) means sum < p, in | 
|  | which case the unreduced sum is kept, otherwise sum - p is taken. */ | 
|  | bn.add    w16, w16, w4 | 
|  | bn.addc   w17, w17, w5 | 
|  | bn.sub    w10, w16, w12 | 
|  | bn.subb   w11, w17, w13 | 
|  | bn.sel    w16, w16, w10, C | 
|  | bn.sel    w17, w17, w11, C | 
|  |  | 
|  | /* store result (right side) | 
|  | dmem[*dptr_r] <= x^3 + ax + b mod p = [w17,w16] */ | 
|  | la        x3, dptr_r | 
|  | lw        x3, 0(x3) | 
|  | li        x2, 16 | 
|  | bn.sid    x2++, 0(x3) | 
|  | bn.sid    x2++, 32(x3) | 
|  |  | 
|  | ret | 
|  |  | 
|  |  | 
|  | /** | 
|  | * 384-bit variable time modular multiplicative inverse computation | 
|  | * | 
|  | * Returns c <= a^(-1) mod m | 
|  | *         where 'a' is a bigint of length 384 bit with a < m | 
|  | *               'm' is the modulus with a length of 384 bit | 
|  | *               'c' is a 384-bit result | 
|  | * | 
|  | * This routine implements the computation of the modular multiplicative | 
|  | * inverse based on the binary GCD or Stein's algorithm. | 
|  | * The implemented variant is based on the "right-shift binary extended GCD" | 
|  | * as it is described in section 3.1 of [1] (Algorithm 1). | 
|  | * [1] https://doi.org/10.1155/ES/2006/32192 | 
|  | * | 
|  | * Special case a == 0: zero has no inverse. The GCD loop would never | 
|  | * terminate for this input (u = m stays odd and v stays 0, so the only exit | 
|  | * test, which follows 'v <= v - u', is never reached). The routine therefore | 
|  | * returns c = 0 for a == 0 without entering the loop. | 
|  | * | 
|  | * The loop relies on m being odd (u = m must reach the 'u odd' path) and on | 
|  | * gcd(a, m) = 1 for a != 0; both hold for the prime moduli used by the | 
|  | * callers in this file. | 
|  | * | 
|  | * Note that this is a variable time implementation. I.e. this routine will | 
|  | * show a data-dependent timing and execution profile. Only use where a | 
|  | * full white-box scenario is acceptable. | 
|  | * | 
|  | * Flags: Flags have no meaning beyond the scope of this subroutine. | 
|  | *        Flag group 0 is read through CSR 0x7c0; mask 1 extracts the carry | 
|  | *        flag C, mask 4 extracts the LSb flag L. | 
|  | * | 
|  | * @param[in]  [w30, w29]: a, 384-bit operand | 
|  | * @param[in]  [w13, w12]: m, modulus | 
|  | * @param[in]  w31: all-zero | 
|  | * @param[out] [w17,w16]: result c (0 if a == 0) | 
|  | * | 
|  | * clobbered registers: x2, w4 to w11, w16 to w19 | 
|  | *                      (for a == 0 only x2, w16 and w17 are written) | 
|  | * clobbered flag groups: FG0 | 
|  | */ | 
|  | mod_inv_var: | 
|  | /* a == 0: return c = 0 instead of looping forever. | 
|  | 0 - a borrows (C set) iff a > 0. */ | 
|  | bn.cmp    w31, w29 | 
|  | bn.cmpb   w31, w30 | 
|  | csrrs     x2, 0x7c0, x0 | 
|  | andi      x2, x2, 1 | 
|  | bne       x2, x0, ebgcd_init | 
|  | bn.mov    w16, w31 | 
|  | bn.mov    w17, w31 | 
|  | ret | 
|  |  | 
|  | ebgcd_init: | 
|  | /* [w5,w4] = r <= 0 */ | 
|  | bn.xor    w4, w4, w4 | 
|  | bn.xor    w5, w5, w5 | 
|  |  | 
|  | /* [w7,w6] = s <= 1 */ | 
|  | bn.addi   w6, w31, 1 | 
|  | bn.xor    w7, w7, w7 | 
|  |  | 
|  | /* [w9,w8] = u <= m = [w13, w12]*/ | 
|  | bn.mov    w8, w12 | 
|  | bn.mov    w9, w13 | 
|  |  | 
|  | /* [w11,w10] = v <= [w30, w29] */ | 
|  | bn.mov    w10, w29 | 
|  | bn.mov    w11, w30 | 
|  |  | 
|  | ebgcd_loop: | 
|  | /* test if u is odd: OR-ing the low word with itself leaves it unchanged | 
|  | and updates the flags, so L reflects bit 0 of u */ | 
|  | bn.or     w8, w8, w8 | 
|  | csrrs     x2, 0x7c0, x0 | 
|  | andi      x2, x2, 4 | 
|  | bne       x2, x0, ebgcd_u_odd | 
|  |  | 
|  | /* u is even: */ | 
|  | /* [w9,w8] = u <= u/2 = [w9,w8] >> 1 */ | 
|  | bn.rshi   w8, w9, w8 >> 1 | 
|  | bn.rshi   w9, w31, w9 >> 1 | 
|  |  | 
|  | /* test if r is odd */ | 
|  | bn.or     w4, w4, w4 | 
|  | csrrs     x2, 0x7c0, x0 | 
|  | andi      x2, x2, 4 | 
|  | bne       x2, x0, ebgcd_r_odd | 
|  |  | 
|  | /* r is even: */ | 
|  | /* [w5,w4] = r <= r/2 = [w5,w4] >> 1 */ | 
|  | bn.rshi   w4, w5, w4 >> 1 | 
|  | bn.rshi   w5, w31, w5 >> 1 | 
|  | jal       x0, ebgcd_loop | 
|  |  | 
|  | ebgcd_r_odd: | 
|  | /* [w5,w4] = r <= (r + m)/2 = ([w5,w4] + [w13,w12]) >> 1 */ | 
|  | bn.add    w4, w4, w12 | 
|  | bn.addc   w5, w5, w13 | 
|  | bn.rshi   w4, w5, w4 >> 1 | 
|  | bn.rshi   w5, w31, w5 >> 1 | 
|  | jal       x0, ebgcd_loop | 
|  |  | 
|  | ebgcd_u_odd: | 
|  | /* test if v is odd */ | 
|  | bn.or     w10, w10, w10 | 
|  | csrrs     x2, 0x7c0, x0 | 
|  | andi      x2, x2, 4 | 
|  | bne       x2, x0, ebgcd_uv_odd | 
|  |  | 
|  | /* v is even: */ | 
|  | /* [w11,w10] = v <= v/2 = [w11,w10] >> 1 */ | 
|  | bn.rshi   w10, w11, w10 >> 1 | 
|  | bn.rshi   w11, w31, w11 >> 1 | 
|  |  | 
|  | /* test if s is odd */ | 
|  | bn.or     w6, w6, w6 | 
|  | csrrs     x2, 0x7c0, x0 | 
|  | andi      x2, x2, 4 | 
|  | bne       x2, x0, ebgcd_s_odd | 
|  |  | 
|  | /* s is even: */ | 
|  | /* [w7,w6] = s <= s/2 = [w7,w6] >> 1 */ | 
|  | bn.rshi   w6, w7, w6 >> 1 | 
|  | bn.rshi   w7, w31, w7 >> 1 | 
|  | jal       x0, ebgcd_loop | 
|  |  | 
|  | ebgcd_s_odd: | 
|  | /* [w7,w6] = s <= (s + m)/2 = ([w7,w6] + [w13,w12]) >> 1 */ | 
|  | bn.add    w6, w6, w12 | 
|  | bn.addc   w7, w7, w13 | 
|  | bn.rshi   w6, w7, w6 >> 1 | 
|  | bn.rshi   w7, w31, w7 >> 1 | 
|  | jal       x0, ebgcd_loop | 
|  |  | 
|  | ebgcd_uv_odd: | 
|  | /* test if v >= u: v - u borrows (C set) iff v < u */ | 
|  | bn.cmp    w10, w8 | 
|  | bn.cmpb   w11, w9 | 
|  | csrrs     x2, 0x7c0, x0 | 
|  | andi      x2, x2, 1 | 
|  | beq       x2, x0, ebgcd_v_gte_u | 
|  |  | 
|  | /* u > v: */ | 
|  | /* [w5,w4] = r <= r - s = [w5,w4] - [w7,w6]; if (r < 0): r <= r + m | 
|  | The selects use the carry of the (r + m) addition: it only carries out | 
|  | if the preceding subtraction wrapped below zero. */ | 
|  | bn.sub    w4, w4, w6 | 
|  | bn.subb   w5, w5, w7 | 
|  | bn.add    w18, w4, w12 | 
|  | bn.addc   w19, w5, w13 | 
|  | bn.sel    w4, w18, w4, C | 
|  | bn.sel    w5, w19, w5, C | 
|  |  | 
|  | /* [w9,w8] = u <= u - v = [w9,w8] - [w11,w10] */ | 
|  | bn.sub    w8, w8, w10 | 
|  | bn.subb   w9, w9, w11 | 
|  | jal       x0, ebgcd_loop | 
|  |  | 
|  | ebgcd_v_gte_u: | 
|  | /* [w7,w6] = s <= s - r = [w7,w6] - [w5,w4]; if (s < 0) s <= s + m */ | 
|  | bn.sub    w6, w6, w4 | 
|  | bn.subb   w7, w7, w5 | 
|  | bn.add    w18, w6, w12 | 
|  | bn.addc   w19, w7, w13 | 
|  | bn.sel    w6, w18, w6, C | 
|  | bn.sel    w7, w19, w7, C | 
|  |  | 
|  | /* [w11,w10] = v <= v - u = [w11,w10] - [w9,w8] */ | 
|  | bn.sub    w10, w10, w8 | 
|  | bn.subb   w11, w11, w9 | 
|  |  | 
|  | /* if v > 0 go back to start of loop (0 - v borrows iff v > 0); | 
|  | this is the only exit from the loop */ | 
|  | bn.cmp    w31, w10 | 
|  | bn.cmpb   w31, w11 | 
|  | csrrs     x2, 0x7c0, x0 | 
|  | andi      x2, x2, 1 | 
|  | bne       x2, x0, ebgcd_loop | 
|  |  | 
|  | /* v <= 0: */ | 
|  | /* if (r > m): [w17,w16] = a <= r - m = [w5,w4] - [w13,w12] | 
|  | else: [w17,w16] = a <= r = [w5,w4] | 
|  | m - r borrows (C set) iff r > m; r - m is precomputed in [w19,w18]. */ | 
|  | bn.sub    w18, w4, w12 | 
|  | bn.subb   w19, w5, w13 | 
|  | bn.cmp    w12, w4 | 
|  | bn.cmpb   w13, w5 | 
|  | bn.sel    w16, w18, w4, C | 
|  | bn.sel    w17, w19, w5, C | 
|  |  | 
|  | ret | 
|  |  | 
|  |  | 
|  | /** | 
|  | * Store curve point in projective coordinates (non randomized) | 
|  | * | 
|  | * Reads an affine P-384 point from dmem, addressed by two independent | 
|  | * pointers for the affine x- and y-coordinate respectively and stores the | 
|  | * same point in projective form at another dmem location. The destination | 
|  | * address is given by a single pointer. All 3 coordinates (x,y,z) are | 
|  | * consecutively stored in this order in little endian format, 256 bit | 
|  | * aligned. Each coordinate occupies two 256-bit words, so 6 words (192 | 
|  | * bytes) are written in total. | 
|  | * | 
|  | * This routine does not randomize the point, hence the z-coordinate is simply | 
|  | * set to 1. | 
|  | * | 
|  | * @param[in]  x10: dptr_x_a, pointer to affine x-coordinate of curve point | 
|  | * @param[in]  x11: dptr_y_a, pointer to affine y-coordinate of curve point | 
|  | * @param[in]  x12: dptr_proj, pointer to destination address | 
|  | * @param[in]  w31: all-zero | 
|  | * @param[out] x12: next dmem address after stored point (256-bit aligned) | 
|  | * | 
|  | * Flags: Flags have no meaning beyond the scope of this subroutine. | 
|  | * | 
|  | * clobbered registers: x2, x12, w6 to w11 | 
|  | * clobbered flag groups: FG0 | 
|  | */ | 
|  | store_aff_proj: | 
|  |  | 
|  | /* load point: [w7,w6] <= x, [w9,w8] <= y, [w11,w10] <= z = 1 | 
|  | x2 is the destination WDR index and is post-incremented by each bn.lid */ | 
|  | li        x2, 6 | 
|  | bn.lid    x2++, 0(x10) | 
|  | bn.lid    x2++, 32(x10) | 
|  | bn.lid    x2++, 0(x11) | 
|  | bn.lid    x2++, 32(x11) | 
|  | bn.addi   w10, w31, 1 | 
|  | bn.xor    w11, w11, w11 | 
|  |  | 
|  | /* store point: write w6..w11 to 6 consecutive 256-bit words. The loop body | 
|  | is the 2 instructions after loopi; the destination pointer x12 is | 
|  | post-incremented by bn.sid, the source WDR index x2 by the addi. | 
|  | The nop after the loop is not part of the loop body (presumably a | 
|  | separator between loop end and ret - TODO confirm why it is needed). */ | 
|  | li        x2, 6 | 
|  | loopi 6, 2 | 
|  | bn.sid    x2, 0(x12++) | 
|  | addi      x2, x2, 1 | 
|  | nop | 
|  |  | 
|  | ret | 
|  |  | 
|  |  | 
|  | /** | 
|  | * Store curve point in projective coordinates (non randomized) | 
|  | * | 
|  | * Stores a P-384 curve point located in 6 consecutive WDRs (w25 to w30) at a | 
|  | * dmem location given by a pointer. All 3 coordinates (x,y,z) are | 
|  | * consecutively stored in this order in little endian format, 256 bit | 
|  | * aligned, i.e. 6 words (192 bytes) are written. | 
|  | * | 
|  | * This routine does not randomize the point. | 
|  | * | 
|  | * @param[in]  x12: dptr_proj, pointer to destination address | 
|  | * @param[in]  [w26,w25]: x-coordinate of curve point | 
|  | * @param[in]  [w28,w27]: y-coordinate of curve point | 
|  | * @param[in]  [w30,w29]: z-coordinate of curve point | 
|  | * @param[out] x12: next dmem address after stored point (256-bit aligned) | 
|  | * | 
|  | * Flags: This routine does not set any flags. | 
|  | * | 
|  | * The loop body is the 2 instructions after loopi: bn.sid stores WDR number | 
|  | * x2 and post-increments the destination pointer x12, addi advances the WDR | 
|  | * index. The nop is not part of the loop body (presumably a separator | 
|  | * between loop end and ret - TODO confirm why it is needed). | 
|  | * | 
|  | * clobbered registers: x2, x12 | 
|  | * clobbered flag groups: none | 
|  | */ | 
|  | store_proj: | 
|  | li        x2, 25 | 
|  | loopi 6, 2 | 
|  | bn.sid    x2, 0(x12++) | 
|  | addi      x2, x2, 1 | 
|  | nop | 
|  | ret | 
|  |  | 
|  | /** | 
|  | * P-384 ECDSA signature verification | 
|  | * | 
|  | * returns the affine x-coordinate of | 
|  | *         (x1, y1) = u1*G + u2*Q | 
|  | *         with u1 = z*s^-1 mod n  and  u2 = r*s^-1 mod n | 
|  | *         where G is the curve's base point, | 
|  | *               z is the message | 
|  | *               r, s is the signature | 
|  | *               Q is the public key. | 
|  | * | 
|  | * The routine computes the x1 coordinate and places it in dmem. x1 will be | 
|  | * reduced (mod n), however, the final comparison has to be performed on the | 
|  | * host side. The signature is valid if x1 == r. | 
|  | * This routine runs in variable time. | 
|  | * | 
|  | * r and s are range-checked (0 < r < n, 0 < s < n); on violation the routine | 
|  | * jumps to 'fail'. | 
|  | * NOTE(review): the 'fail' exits store [w5,w4] to dmem[*dptr_rnd] without | 
|  | * this routine having defined it. Before the s checks [w5,w4] holds whatever | 
|  | * the caller left there; after mod_inv_var it holds that routine's working | 
|  | * value. The stored value is therefore not a reliable "invalid" marker; the | 
|  | * host should range-check r and s itself or the contract should be made | 
|  | * explicit - confirm intended behaviour. | 
|  | * NOTE(review): the public key Q is not checked to be on the curve here. | 
|  | * | 
|  | * @param[in]  dmem[4]: dptr_rnd, pointer to dmem location where the reduced | 
|  | *                           affine x1-coordinate will be stored | 
|  | * @param[in]  dmem[8]: dptr_msg, pointer to the message to be verified in dmem | 
|  | * @param[in]  dmem[12]: dptr_r, pointer to r of signature in dmem | 
|  | * @param[in]  dmem[16]: dptr_s, pointer to s of signature in dmem | 
|  | * @param[in]  dmem[20]: dptr_x, pointer to x-coordinate of public key in dmem | 
|  | * @param[in]  dmem[24]: dptr_y, pointer to y-coordinate of public key in dmem | 
|  | * | 
|  | * Scratchpad memory layout: | 
|  | * The routine expects at least 896 bytes of scratchpad memory at dmem | 
|  | * location 'scratchpad' (sp). Internally the scratchpad is used as follows: | 
|  | * dptr_sp     .. dptr_sp+191: point C, projective | 
|  | * dptr_sp+192 .. dptr_sp+383: point G, projective | 
|  | * dptr_sp+384 .. dptr_sp+575: point Q, projective | 
|  | * dptr_sp+576 .. dptr_sp+767: point Q+G, projective | 
|  | * dptr_sp+768 .. dptr_sp+831: scalar u1 | 
|  | * dptr_sp+832 .. dptr_sp+895: scalar u2 | 
|  | * | 
|  | * Flags: Flags have no meaning beyond the scope of this subroutine. | 
|  | * | 
|  | * clobbered registers: x1 (link register), x2 to x5, x10, x11, x12, | 
|  | *                      x22 to x28, w0 to w31 | 
|  | * clobbered flag groups: FG0 | 
|  | */ | 
|  | .globl p384_verify | 
|  | p384_verify: | 
|  |  | 
|  | /* init all-zero reg */ | 
|  | bn.xor    w31, w31, w31 | 
|  |  | 
|  | /* load domain parameter n (order of base point) | 
|  | [w13, w12] <= n = dmem[p384_n] */ | 
|  | li        x2, 12 | 
|  | la        x3, p384_n | 
|  | bn.lid    x2++, 0(x3) | 
|  | bn.lid    x2++, 32(x3) | 
|  |  | 
|  | /* load s of signature from dmem | 
|  | [w30,w29] <= s = dmem[*dptr_s] */ | 
|  | li        x2, 29 | 
|  | la        x3, dptr_s | 
|  | lw        x3, 0(x3) | 
|  | bn.lid    x2++, 0(x3) | 
|  | bn.lid    x2++, 32(x3) | 
|  |  | 
|  | /* goto 'fail' if [w30,w29] == [w31, w31] <=> s == 0 | 
|  | (0 - s borrows, i.e. sets C, iff s > 0) */ | 
|  | bn.cmp    w31, w29 | 
|  | bn.cmpb   w31, w30 | 
|  | csrrs     x2, 0x7c0, x0 | 
|  | andi      x2, x2, 1 | 
|  | beq       x2, x0, fail | 
|  |  | 
|  | /* goto 'fail' if [w30,w29] >= [w13,w12] <=> s >= n | 
|  | (s - n borrows, i.e. sets C, iff s < n) */ | 
|  | bn.cmp    w29, w12 | 
|  | bn.cmpb   w30, w13 | 
|  | csrrs     x2, 0x7c0, x0 | 
|  | andi      x2, x2, 1 | 
|  | beq       x2, x0, fail | 
|  |  | 
|  | /* Compute modular inverse of S | 
|  | Note: This can be replaced by the 'mod_inv_n_p384' subroutine at the | 
|  | cost of ~60k cycles if reduced code size is targeted */ | 
|  | /* [w9,w8] <= [w17,w16] <= s^-1  mod n = [w30,w29]^-1 mod [w13,w12] */ | 
|  | jal       x1, mod_inv_var | 
|  | bn.mov    w8, w16 | 
|  | bn.mov    w9, w17 | 
|  |  | 
|  | /* load Barrett constant u_n for modulus n for scalar operations | 
|  | [w15, w14] <= u_n = dmem[p384_u_n] */ | 
|  | li        x2, 14 | 
|  | la        x3, p384_u_n | 
|  | bn.lid    x2++, 0(x3) | 
|  | bn.lid    x2++, 32(x3) | 
|  |  | 
|  | /* set regfile pointers to in/out regs of Barrett routine */ | 
|  | li        x22, 10 | 
|  | li        x23, 11 | 
|  | li        x24, 16 | 
|  | li        x25, 17 | 
|  |  | 
|  | /* load r of signature from dmem | 
|  | [w11,w10] <= r = dmem[*dptr_r] */ | 
|  | li        x2, 10 | 
|  | la        x3, dptr_r | 
|  | lw        x3, 0(x3) | 
|  | bn.lid    x2++, 0(x3) | 
|  | bn.lid    x2++, 32(x3) | 
|  |  | 
|  | /* goto 'fail' if [w11, w10] == [w31, w31] <=> r == 0 */ | 
|  | bn.cmp    w31, w10 | 
|  | bn.cmpb   w31, w11 | 
|  | csrrs     x2, 0x7c0, x0 | 
|  | andi      x2, x2, 1 | 
|  | beq       x2, x0, fail | 
|  |  | 
|  | /* goto 'fail' if [w11,w10] >= [w13,w12] <=> r >= n */ | 
|  | bn.cmp    w10, w12 | 
|  | bn.cmpb   w11, w13 | 
|  | csrrs     x2, 0x7c0, x0 | 
|  | andi      x2, x2, 1 | 
|  | beq       x2, x0, fail | 
|  |  | 
|  | /* u2 = [w3,w2] <= [w17,w16] <= r*s^-1 mod n | 
|  | = [w11,w10]*[w17,w16] mod [w13,w12] | 
|  | ([w17,w16] still holds s^-1 from mod_inv_var) */ | 
|  | jal x1, barrett384_p384 | 
|  | bn.mov    w2, w16 | 
|  | bn.mov    w3, w17 | 
|  | /* left align: shift the 384-bit scalar up by 128 bits so that its MSb sits | 
|  | in bit 511 of [w3,w2]; the main loop shifts bits out through the carry */ | 
|  | bn.rshi   w3, w3, w2 >> 128 | 
|  | bn.rshi   w2, w2, w31 >> 128 | 
|  |  | 
|  | /* load message from dmem | 
|  | [w11,w10] <= msg = dmem[*dptr_msg] */ | 
|  | li        x2, 10 | 
|  | la        x3, dptr_msg | 
|  | lw        x3, 0(x3) | 
|  | bn.lid    x2++, 0(x3) | 
|  | bn.lid    x2++, 32(x3) | 
|  |  | 
|  | /* u1 = [w1,w0] <= [w17,w16] <= msg*s^-1 mod n | 
|  | = [w11,w10]*[w9,w8] mod [w13,w12] */ | 
|  | bn.mov    w16, w8 | 
|  | bn.mov    w17, w9 | 
|  | jal       x1, barrett384_p384 | 
|  | bn.mov    w0, w16 | 
|  | bn.mov    w1, w17 | 
|  | /* left align */ | 
|  | bn.rshi   w1, w1, w0 >> 128 | 
|  | bn.rshi   w0, w0, w31 >> 128 | 
|  |  | 
|  | /* store u1 and u2 in scratchpad | 
|  | scratchpad[768] <= u1 = [w1,w0]; scratchpad[832] <= u2 = [w3,w2] */ | 
|  | li        x2, 0 | 
|  | la        x26, scratchpad | 
|  | bn.sid    x2++, 768(x26) | 
|  | bn.sid    x2++, 800(x26) | 
|  | bn.sid    x2++, 832(x26) | 
|  | bn.sid    x2++, 864(x26) | 
|  |  | 
|  | /* load domain parameter p (modulus) | 
|  | [w13, w12] <= p = dmem[p384_p] */ | 
|  | li        x2, 12 | 
|  | la        x3, p384_p | 
|  | bn.lid    x2++, 0(x3) | 
|  | bn.lid    x2++, 32(x3) | 
|  |  | 
|  | /* load Barrett constant u_p for modulus p | 
|  | [w15, w14] = u_p = dmem[p384_u_p] */ | 
|  | li        x2, 14 | 
|  | la        x3, p384_u_p | 
|  | bn.lid    x2++, 0(x3) | 
|  | bn.lid    x2++, 32(x3) | 
|  |  | 
|  | /* set dmem pointer to domain parameter b */ | 
|  | la        x28, p384_b | 
|  |  | 
|  | /* init double and add algorithm with C = (0, 1, 0) | 
|  | C = (x,y,z) = scratchpad[0] <= (0, 1, 0) | 
|  | store_proj leaves x12 at scratchpad+192, where G is written next */ | 
|  | bn.xor    w25, w25, w25 | 
|  | bn.xor    w26, w26, w26 | 
|  | bn.addi   w27, w31, 1 | 
|  | bn.xor    w28, w28, w28 | 
|  | bn.xor    w29, w29, w29 | 
|  | bn.xor    w30, w30, w30 | 
|  | la        x12, scratchpad | 
|  | jal       x1, store_proj | 
|  |  | 
|  | /* load base point G and use in projective form (set z to 1) | 
|  | G = (x,y,z) = scratchpad[192] <= (dmem[p384_gx], dmem[p384_gy], 1) */ | 
|  | la        x10, p384_gx | 
|  | la        x11, p384_gy | 
|  | jal       x1, store_aff_proj | 
|  |  | 
|  | /* load public key Q from dmem and use in projective form (set z to 1) | 
|  | Q = (x,y,z) = scratchpad[384] <= (dmem[*dptr_x], dmem[*dptr_y], 1) */ | 
|  | la        x3, dptr_x | 
|  | lw        x10, 0(x3) | 
|  | la        x3, dptr_y | 
|  | lw        x11, 0(x3) | 
|  | jal       x1,  store_aff_proj | 
|  |  | 
|  | /* The remaining part of the routine implements a variable time | 
|  | double-and-add algorithm. For the signature verification we need to | 
|  | compute the point C = (x1, y1) = u1*G + u2*Q. This can be done in a | 
|  | single double-and-add routine by using Shamir's Trick. */ | 
|  |  | 
|  | /* Compute G+Q and store in dmem | 
|  | GQ = (x,y,z) = dmem[dptr_sp+576] | 
|  | <= dmem[dptr_sp+192] (+) dmem[dptr_sp+384] | 
|  | x12 is expected to still be scratchpad+576 from the two store_aff_proj | 
|  | calls above (assumes proj_add_p384 does not modify x12 - it is defined | 
|  | elsewhere, TODO confirm) */ | 
|  | la        x26, scratchpad | 
|  | addi      x27, x26, 384 | 
|  | addi      x26, x26, 192 | 
|  | jal       x1, proj_add_p384 | 
|  | jal       x1, store_proj | 
|  |  | 
|  | la        x26, scratchpad | 
|  |  | 
|  | /* main loop with decreasing index i (i=383 downto 0) | 
|  | The loop body is the 35 instructions following loopi, up to and including | 
|  | the nop after the last store_proj call. Keep this count in sync when | 
|  | editing the body. */ | 
|  | loopi     384, 35 | 
|  |  | 
|  | /* probe MSBs of u1 and u2 and u1|u2 to determine which point has to be | 
|  | added. */ | 
|  |  | 
|  | /* load u1 and u2 from scratchpad | 
|  | [w1,w0] <= u1; [w3, w2] = u2 */ | 
|  | li        x2, 0 | 
|  | bn.lid    x2++, 768(x26) | 
|  | bn.lid    x2++, 800(x26) | 
|  | bn.lid    x2++, 832(x26) | 
|  | bn.lid    x2++, 864(x26) | 
|  |  | 
|  | /* left shift u1 = [w1,w0] <= [w1,w0] << 1 */ | 
|  | bn.add    w0, w0, w0 | 
|  | bn.addc   w1, w1, w1 | 
|  |  | 
|  | /* keep MSB/carry bit in x3: x3 <= u1[i] */ | 
|  | csrrs     x3, 0x7c0, x0 | 
|  | andi      x3, x3, 1 | 
|  |  | 
|  | /* left shift u2 = [w3,w2] <= [w3,w2] << 1 */ | 
|  | bn.add    w2, w2, w2 | 
|  | bn.addc   w3, w3, w3 | 
|  |  | 
|  | /* keep MSB/carry bit in x4: x4 <= u2[i] */ | 
|  | csrrs     x4, 0x7c0, x0 | 
|  | andi      x4, x4, 1 | 
|  | li        x2, 0 | 
|  |  | 
|  | /* write back u1 and u2 to scratchpad */ | 
|  | bn.sid    x2++, 768(x26) | 
|  | bn.sid    x2++, 800(x26) | 
|  | bn.sid    x2++, 832(x26) | 
|  | bn.sid    x2++, 864(x26) | 
|  |  | 
|  | /* test if at least one MSb of the scalars is 1 | 
|  | x5 <= x4 | x3 = u1[i] | u2[i] */ | 
|  | or        x5, x4, x3 | 
|  |  | 
|  | /* always double, let both input pointers for point addition point to C */ | 
|  | add       x27, x26, x0 | 
|  |  | 
|  | /* no addition if x5 = u1[i] | u2[i] == 0: the doubling is then done by | 
|  | the shared proj_add_p384 call at ver_end_loop */ | 
|  | beq       x5, x0, ver_end_loop | 
|  |  | 
|  | /* perform point doubling C <= 2 (*) C */ | 
|  | jal       x1, proj_add_p384 | 
|  | addi      x12, x26, 0 | 
|  | jal       x1, store_proj | 
|  |  | 
|  | /* check if u1[i] is set */ | 
|  | bne       x3, x0, u1_set | 
|  |  | 
|  | /* only u2[i] is set: do C <= C + Q */ | 
|  | addi      x27, x26, 384 | 
|  | jal       x0, ver_end_loop | 
|  |  | 
|  | u1_set: | 
|  | /* check if u2[i] is set as well */ | 
|  | bne       x4, x0, both | 
|  |  | 
|  | /* only u1[i] is set: do C <= C + G */ | 
|  | addi      x27, x26, 192 | 
|  | jal       x0, ver_end_loop | 
|  |  | 
|  | /* both bits at current index (u1[i] and u2[i]) are set: | 
|  | do: C <= C + (G + Q) */ | 
|  | both: | 
|  | addi      x27, x26, 576 | 
|  |  | 
|  | ver_end_loop: | 
|  | /* perform addition of selected point here, or point doubling in case | 
|  | of no addition */ | 
|  | jal       x1, proj_add_p384 | 
|  | addi      x12, x26, 0 | 
|  | jal       x1, store_proj | 
|  | nop | 
|  |  | 
|  | /* compute inverse of z-coordinate: [w17,w16] <= z_c^-1  mod p | 
|  | (input [w30,w29] = z_c as left by the last point operation) | 
|  | NOTE(review): if C is the point at infinity, z_c == 0 is passed to | 
|  | mod_inv_var; confirm that routine's behaviour for a zero operand. */ | 
|  | jal       x1, mod_inv_var | 
|  |  | 
|  | /* convert x-coordinate of C back to affine: x1 = x_c * z_c^-1  mod p | 
|  | [w17,w16] <= [w26,w25] * [w17,w16] mod [w13,w12] */ | 
|  | bn.mov    w10, w25 | 
|  | bn.mov    w11, w26 | 
|  | jal x1, barrett384_p384 | 
|  |  | 
|  | /* load domain parameter n (order of base point) | 
|  | [w13, w12] <= n = dmem[p384_n] */ | 
|  | li        x2, 12 | 
|  | la        x3, p384_n | 
|  | bn.lid    x2++, 0(x3) | 
|  | bn.lid    x2++, 32(x3) | 
|  |  | 
|  | /* final reduction: [w5,w4] = x1 <= x1 mod n = [w17,w16] mod [w13,w12] | 
|  | a single conditional subtraction: x1 - n borrows (C set) iff x1 < n, in | 
|  | which case x1 is kept unchanged */ | 
|  | bn.sub    w4, w16, w12 | 
|  | bn.subb   w5, w17, w13 | 
|  | bn.sel    w4, w16, w4, C | 
|  | bn.sel    w5, w17, w5, C | 
|  |  | 
|  | fail: | 
|  |  | 
|  | /* store affine x-coordinate in dmem: dmem[*dptr_rnd] <= x1 = [w5,w4] | 
|  | (also reached from the range-check failures above, see header note) */ | 
|  | li        x2, 4 | 
|  | la        x3, dptr_rnd | 
|  | lw        x3, 0(x3) | 
|  | bn.sid    x2++, 0(x3) | 
|  | bn.sid    x2++, 32(x3) | 
|  |  | 
|  | ret | 
|  |  | 
|  |  | 
|  | /* pointers and scratchpad memory | 
|  | Eight 4-byte pointer slots are laid out back to back (section offsets 0, | 
|  | 4, ..., 28), followed by the scratchpad. Each slot is zero-initialised here | 
|  | and read with lw by the routines above; the slots are expected to be filled | 
|  | in before a routine is called. dptr_k and dptr_d are not referenced by the | 
|  | routines shown in this file. */ | 
|  | .section .data | 
|  |  | 
|  | /* pointer to k (dptr_k) */ | 
|  | .globl dptr_k | 
|  | dptr_k: | 
|  | .zero 4 | 
|  |  | 
|  | /* pointer to rnd (dptr_rnd) | 
|  | used for result here */ | 
|  | .globl dptr_rnd | 
|  | dptr_rnd: | 
|  | .zero 4 | 
|  |  | 
|  | /* pointer to msg (dptr_msg) */ | 
|  | .globl dptr_msg | 
|  | dptr_msg: | 
|  | .zero 4 | 
|  |  | 
|  | /* pointer to R (dptr_r) */ | 
|  | .globl dptr_r | 
|  | dptr_r: | 
|  | .zero 4 | 
|  |  | 
|  | /* pointer to S (dptr_s) */ | 
|  | .globl dptr_s | 
|  | dptr_s: | 
|  | .zero 4 | 
|  |  | 
|  | /* pointer to X (dptr_x) */ | 
|  | .globl dptr_x | 
|  | dptr_x: | 
|  | .zero 4 | 
|  |  | 
|  | /* pointer to Y (dptr_y) */ | 
|  | .globl dptr_y | 
|  | dptr_y: | 
|  | .zero 4 | 
|  |  | 
|  | /* pointer to D (dptr_d) */ | 
|  | .globl dptr_d | 
|  | dptr_d: | 
|  | .zero 4 | 
|  |  | 
|  | /* 896 bytes of scratchpad memory (4 projective points of 192 bytes each | 
|  | plus two 64-byte scalars, as used by p384_verify) */ | 
|  | scratchpad: | 
|  | .zero 896 | 