[sw/silicon-creator] Use sw/otbn ECDSA assembly files instead of local copies.

Signed-off-by: Jade Philipoom <jadep@google.com>
diff --git a/sw/device/silicon_creator/lib/crypto/ecdsa_p256/LICENSE.dcrypto b/sw/device/silicon_creator/lib/crypto/ecdsa_p256/LICENSE.dcrypto deleted file mode 100644 index d3295c3..0000000 --- a/sw/device/silicon_creator/lib/crypto/ecdsa_p256/LICENSE.dcrypto +++ /dev/null
@@ -1,27 +0,0 @@ -Copyright 2010 The Chromium OS Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - -* Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -* Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. -* Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/sw/device/silicon_creator/lib/crypto/ecdsa_p256/meson.build b/sw/device/silicon_creator/lib/crypto/ecdsa_p256/meson.build index c2e2586..5922262 100644 --- a/sw/device/silicon_creator/lib/crypto/ecdsa_p256/meson.build +++ b/sw/device/silicon_creator/lib/crypto/ecdsa_p256/meson.build
@@ -2,68 +2,6 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 -# Create otbn sources dictionary -sw_lib_crypto_ecdsa_p256_otbn_sources = { - 'p256': files( - 'p256.s' - ), - 'p256_ecdsa': files( - 'p256_ecdsa.s', - 'p256.s', - ), -} - -# OTBN BUILD procedure -sw_lib_crypto_ecdsa_p256_otbn = {} -foreach app_name, app_sources : sw_lib_crypto_ecdsa_p256_otbn_sources - # Output files generated by the otbn_build.py script. - app_output_files = [ - app_name + '.rv32embed.o', - app_name + '.elf', - ] - - # Target calling otbn_build.py - target = custom_target( - 'sw_lib_crypto_otbn_apps_' + app_name + '_target', - input: app_sources, - output: app_output_files, - command: otbn_build_command, - depend_files: [otbn_build_depend_files,], - ) - - # A library containing the OTBN application in a form embeddable into device - # (Ibex) software (the *.rv32embed.o file). - embedded_lib = static_library( - app_name, - [target[0]] # == app_output_files[0], i.e. *.rv32embed.o - ) - - # A dependency on the application as embeddable library, to be used if - # device (Ibex) software wants to include an OTBN application in its binary. 
- app_dependency = declare_dependency( - link_with: embedded_lib, - ) - - sw_lib_crypto_ecdsa_p256_otbn += { - app_name: { - 'elf': target[1], - 'rv32embed_lib': embedded_lib, - 'rv32embed_dependency': app_dependency, - } - } - - custom_target( - 'sw_lib_crypto_ecdsa_p256_otbn_app_export_' + app_name, - command: export_target_command, - depend_files: [export_target_depend_files,], - input: [target[1]], - output: 'sw_lib_crypto_ecdsa_p256_otbn_app_export_' + app_name, - build_always_stale: true, - build_by_default: true, - ) - -endforeach - # C wrapper for ECDSA sign/verify sw_silicon_creator_lib_crypto_ecdsa_p256 = declare_dependency( link_with: static_library( @@ -73,7 +11,7 @@ ], dependencies: [ sw_silicon_creator_lib_otbn_util, - sw_lib_crypto_ecdsa_p256_otbn['p256_ecdsa']['rv32embed_dependency'], + sw_otbn['p256_ecdsa']['rv32embed_dependency'], ], ), )
diff --git a/sw/device/silicon_creator/lib/crypto/ecdsa_p256/p256.s b/sw/device/silicon_creator/lib/crypto/ecdsa_p256/p256.s deleted file mode 100644 index 4068c70..0000000 --- a/sw/device/silicon_creator/lib/crypto/ecdsa_p256/p256.s +++ /dev/null
@@ -1,1905 +0,0 @@ -/* Copyright lowRISC Contributors. - * Copyright 2016 The Chromium OS Authors. All rights reserved. - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE.dcrypto file. - * - * Derived from code in - * https://chromium.googlesource.com/chromiumos/platform/ec/+/refs/heads/cr50_stab/chip/g/dcrypto/dcrypto_p256.c - */ - -.globl p256_isoncurve -.globl p256_scalar_mult -.globl p256_base_mult -.globl p256_sign -.globl p256_verify -.globl proj_add - -.text - -/** - * 256-bit modular multiplication based on Barrett reduction algorithm. - * - * Returns c = a * b mod p - * - * Expects two 256 bit operands, 256 bit modulus and pre-computed parameter u - * for Barrett reduction (usually greek mu in literature). u is expected - * without the leading 1 at bit 256. u has to be pre-computed as - * u = floor(2^512/p). - * This guarantees that u > 2^256, however, in order for u to be at - * most 2^257-1, it has to be ensured that p >= 2^255 + 1. - * - * This implementation mostly follows the description in the - * "Handbook of Applied Cryptography" in Algorithm 14.42. - * Differences: - * - This implementation incorporates a multiplication before the reduction. - * Therefore it expects two operands (a, b) instead of a wider integer x. - * - The computation of q2 ignores the MSbs of q1 and u to allow using - * a 256x256 bit multiplication. This is compensated later by - * individual (conditional) additions. - * - The truncations in step 2 of HAC 14.42 in the form of (... mod b^(k+1) ) - * are not implemented here and the full register width is used. This - * allows to omit computation of r1 (since r1=x) and step 3 of HAC 14.42 - * - There is only a single conditional subtraction of the modulus at the end - * - * Note that this implementation is targeted and tested to be used with modulus - * and Barrett constant of the P-256 underlying finite field only. 
For a - * generic modulus a 2nd conditional subtraction of the modulus has to be - * added or the modulus has to be in a range such that it can be mathematically - * proven that a single subtraction is sufficient. - * - * Flags: Flags have no meaning beyond the scope of this subroutine. - * - * @param[in] w24: a, first 256 bit operand (a * b < 2^256 * p) - * @param[in] w25: b, second 256 bit operand (a * b < 2^256 * p) - * @param[in] w29: p, modulus of P-256 underlying finite field - * @param[in] w28: u, lower 256 bit of Barrett constant for curve P-256 - * @param[in] w31: all-zero - * @param[in] MOD: p, modulus of P-256 underlying finite field - * @param[out] w19: c, result - * - * clobbered registers: w19, w20, w21, w22, w23, w24, w25 - * clobbered flag groups: FG0 - */ -mod_mul_256x256: - /* Compute the integer product of the operands x = a * b - x = [w20, w19] = a * b = w24 * w25 - => max. length x: 512 bit */ - bn.mulqacc.z w24.0, w25.0, 0 - bn.mulqacc w24.1, w25.0, 64 - bn.mulqacc.so w19.L, w24.0, w25.1, 64 - bn.mulqacc w24.2, w25.0, 0 - bn.mulqacc w24.1, w25.1, 0 - bn.mulqacc w24.0, w25.2, 0 - bn.mulqacc w24.3, w25.0, 64 - bn.mulqacc w24.2, w25.1, 64 - bn.mulqacc w24.1, w25.2, 64 - bn.mulqacc.so w19.U, w24.0, w25.3, 64 - bn.mulqacc w24.3, w25.1, 0 - bn.mulqacc w24.2, w25.2, 0 - bn.mulqacc w24.1, w25.3, 0 - bn.mulqacc w24.3, w25.2, 64 - bn.mulqacc.so w20.L, w24.2, w25.3, 64 - bn.mulqacc.so w20.U, w24.3, w25.3, 0 - bn.add w20, w20, w31 - - /* Store correction factor to compensate for later neglected MSb of x. - x is 512 bit wide and therefore the 255 bit right shifted version q1 - (below) contains 257 bit. Bit 256 of q1 is neglected to allow using a - 256x256 multiplier. For the MSb of x being set we temporary store u - (or zero) here to be used in a later constant time correction of a - multiplication with u. Note that this requires the MSb flag being carried - over from the multiplication routine. 
*/ - bn.sel w22, w28, w31, M - - /* Compute q1' = q1[255:0] = x >> 255 - w21 = q1' = [w20, w19] >> 255 */ - bn.rshi w21, w20, w19 >> 255 - - /* Compute q2 = q1*u - Instead of full q2 (which would be up to 514 bits) we ignore the MSb of u - and the MSb of q1 and correct this later. This allows using a 256x256 - multiplier. - => max. length q2': 512 bit - q2' = q1[255:0]*u[255:0] = [w20, w19] = w21 * w28 */ - bn.mulqacc.z w21.0, w28.0, 0 - bn.mulqacc w21.1, w28.0, 64 - bn.mulqacc.so w23.L, w21.0, w28.1, 64 - bn.mulqacc w21.2, w28.0, 0 - bn.mulqacc w21.1, w28.1, 0 - bn.mulqacc w21.0, w28.2, 0 - bn.mulqacc w21.3, w28.0, 64 - bn.mulqacc w21.2, w28.1, 64 - bn.mulqacc w21.1, w28.2, 64 - bn.mulqacc.so w23.U, w21.0, w28.3, 64 - bn.mulqacc w21.3, w28.1, 0 - bn.mulqacc w21.2, w28.2, 0 - bn.mulqacc w21.1, w28.3, 0 - bn.mulqacc w21.3, w28.2, 64 - bn.mulqacc.so w24.L, w21.2, w28.3, 64 - bn.mulqacc.so w24.U, w21.3, w28.3, 0 - bn.add w20, w20, w31 - - /* q3 = q2 >> 257 - In this step, the compensation for the neglected MSbs of q1 and u is - carried out underway. To add them in the q2 domain, they would have to be - left shifted by 256 bit first. To directly add them we first shift q2' by - 256 bit to the right (by dropping the lower 256 bit), perform the - additions, and shift the result another bit to the right. 
- - /* w25 = q1[256] = x[511:256] >> 255 */ - bn.rshi w25, w31, w20 >> 255 - - /* compensate for neglected MSB of u, by adding full q1 - this is unconditional since MSB of u is always 1 - [w25, w24] = q2'' <= q2'[511:256] + q1 = [w25, w24] <= w24 + [w25, w21] */ - bn.add w24, w24, w21 - bn.addc w25, w25, w31 - - /* compensate for neglected MSB of q1, by conditionally adding u */ - /* [w25, w24] = q2''' <= q2'' + [0 or u] = [w25, w24] + w22 */ - bn.add w24, w24, w22 - bn.addc w25, w25, w31 - - /* q3 = w21 = q2''' >> 1 */ - bn.rshi w21, w25, w24 >> 1 - - /* [w23, w22] <= q3 * p */ - bn.mulqacc.z w29.0, w21.0, 0 - bn.mulqacc w29.1, w21.0, 64 - bn.mulqacc.so w22.L, w29.0, w21.1, 64 - bn.mulqacc w29.2, w21.0, 0 - bn.mulqacc w29.1, w21.1, 0 - bn.mulqacc w29.0, w21.2, 0 - bn.mulqacc w29.3, w21.0, 64 - bn.mulqacc w29.2, w21.1, 64 - bn.mulqacc w29.1, w21.2, 64 - bn.mulqacc.so w22.U, w29.0, w21.3, 64 - bn.mulqacc w29.3, w21.1, 0 - bn.mulqacc w29.2, w21.2, 0 - bn.mulqacc w29.1, w21.3, 0 - bn.mulqacc w29.3, w21.2, 64 - bn.mulqacc.so w23.L, w29.2, w21.3, 64 - bn.mulqacc.so w23.U, w29.3, w21.3, 0 - bn.add w23, w23, w31 - - /* We compute the final remainder r by subtracting the estimate q3 from x. - In the generic algorithm, r is already the reduced result or it is off by - either p or 2p. For the special case of the modulus of P-256 it can be - shown that r can only be off by max p. Therefore, only a single - conditional correction is required. - [w20, w22] = r <= [w20, w19] - [w23, w22] = x - q3*p */ - bn.sub w22, w19, w22 - bn.subb w20, w20, w23 - - /* r cannot be wider than 257 bit. 
Therefore it is r > p if bit 256 of r is - set and we need to subtract the modulus */ - bn.sel w21, w29, w31, L - bn.sub w21, w22, w21 - - /* this performs the correction in case r is only 256 bit long but still - greater than the modulus */ - bn.addm w19, w21, w31 - - ret - - -/** - * Checks if a point is a valid curve point on curve P-256 (secp256r1) - * - * Returns r = x^3 + ax + b mod p - * and s = y^2 mod p - * with x,y being the affine coordinates of the curve point - * a, b and p being the domain parameters of P-256 - * - * This routine checks if a point with given x- and y-coordinate is a valid - * curve point on P-256. - * The routine checks whether the coordinates are a solution of the - * Weierstrass equation y^2 = x^3 + ax + b mod p. - * The routine makes use of the property that the domain parameter 'a' can be - * written as a=-3 for the P-256 curve, hence the routine is limited to P-256. - * The routine does not return a boolean result but computes the left side - * and the right sight of the Weierstrass equation and leaves the final - * comparison to the caller. - * The routine runs in constant time. - * - * Flags: Flags have no meaning beyond the scope of this subroutine. 
- * - * @param[in] dmem[12]: dptr_r, pointer to dmem location where right - * side result r will be stored - * @param[in] dmem[16]: dptr_s, pointer to dmem location where left side - * result s will be stored - * @param[in] dmem[20]: dptr_x, pointer to dmem location containing affine - * x-coordinate of input point - * @param[in] dmem[24]: dptr_y, pointer to dmem location containing affine - * y-coordinate of input point - * - * clobbered registers: x2, x3, x19, x20, w0, w19 to w25 - * clobbered flag groups: FG0 - */ -p256_isoncurve: - - /* setup all-zero reg */ - bn.xor w31, w31, w31 - - /* load dmem pointer to signature r in dmem: x19 <= dptr_r = dmem[12] */ - la x19, dptr_r - lw x19, 0(x19) - - /* load dmem pointer to signature s in dmem: x20 <= dptr_s = dmem[16] */ - la x20, dptr_s - lw x20, 0(x20) - - /* setup modulus p and Barrett constant u - MOD <= w29 <= dmem[p256_p] = p; w28 <= dmem[p256_u_p] = u_p */ - li x2, 29 - la x3, p256_p - bn.lid x2, 0(x3) - bn.wsrw 0, w29 - li x2, 28 - la x3, p256_u_p - bn.lid x2, 0(x3) - - /* load domain parameter b from dmem - w27 <= b = dmem[p256_b] */ - li x2, 27 - la x3, p256_b - bn.lid x2, 0(x3) - - /* load affine y-coordinate of curve point from dmem - w26 <= dmem[dptr_y] = dmem[24] */ - la x3, dptr_y - lw x3, 0(x3) - li x2, 24 - bn.lid x2, 0(x3) - - /* w19 <= y^2 = w24*w24 */ - bn.mov w25, w24 - jal x1, mod_mul_256x256 - - /* store left side result: dmem[dptr_s] <= w0 = y^2 mod p */ - li x2, 19 - bn.sid x2, 0(x20) - - /* load affine x-coordinate of curve point from dmem - w26 <= dmem[dptr_x] = dmem[20] */ - la x3, dptr_x - lw x3, 0(x3) - li x2, 26 - bn.lid x2, 0(x3) - - /* w19 <= x^2 = w26*w26 */ - bn.mov w25, w26 - bn.mov w24, w26 - jal x1, mod_mul_256x256 - - /* w19 = x^3 <= x^2 * x = w25*w24 = w26*w19 */ - bn.mov w25, w19 - bn.mov w24, w26 - jal x1, mod_mul_256x256 - - /* for curve P-256, 'a' can be written as a = -3, therefore we subtract - x three times from x^3. 
- w19 = x^3 + ax <= x^3 - 3x mod p */ - bn.subm w19, w19, w26 - bn.subm w19, w19, w26 - bn.subm w19, w19, w26 - - /* w24 <= x^3 + ax + b mod p = w19 + w27 mod p */ - bn.addm w19, w19, w27 - - /* store right side result: dmem[dptr_s] <= w19 = x^3 + ax + b mod p */ - li x2, 19 - bn.sid x2, 0(x19) - - ret - - -/** - * P-256 point addition in projective coordinates - * - * returns R = (x_r, y_r, z_r) <= P+Q = (x_p, y_p, z_p) + (x_q, y_q, z_q) - * with R, P and Q being valid P-256 curve points - * in projective coordinates - * - * This routine adds two valid P-256 curve points in projective space. - * Point addition is performed based on the complete formulas of Bosma and - * Lenstra for Weierstrass curves as first published in [1] and - * optimized in [2]. - * The implemented version follows Algorithm 4 of [2] which is an optimized - * variant for Weierstrass curves with domain parameter 'a' set to a=-3. - * Numbering of the steps below and naming of symbols follows the - * terminology of Algorithm 4 of [2]. - * The routine is limited to P-256 curve points due to: - * - fixed a=-3 domain parameter - * - usage of a P-256 optimized Barrett multiplication kernel - * This routine runs in constant time. - * - * [1] https://doi.org/10.1006/jnth.1995.1088 - * [2] https://doi.org/10.1007/978-3-662-49890-3_16 - * - * @param[in] w8: x_p, x-coordinate of input point P - * @param[in] w9: y_p, y-coordinate of input point P - * @param[in] w10: z_p, z-coordinate of input point P - * @param[in] w11: x_q, x-coordinate of input point Q - * @param[in] w12: y_q, x-coordinate of input point Q - * @param[in] w13: z_q, x-coordinate of input point Q - * @param[in] w27: b, curve domain parameter - * @param[in] w29: p, modulus, 2^256 > p > 2^255. - * @param[in] w28: u, pre-computed Barrett constant (without u[256]/MSb - * of u which is always 1 for the allowed range. - * @param[in] w31: all-zero. - * @param[in] MOD: p, modulus, 2^256 > p > 2^255. 
- * @param[out] w11: x_r, x-coordinate of resulting point R - * @param[out] w12: y_r, x-coordinate of resulting point R - * @param[out] w13: z_r, x-coordinate of resulting point R - * - * Flags: Flags have no meaning beyond the scope of this subroutine. - * - * clobbered registers: w11 to w25 - * clobbered flag groups: FG0 - */ -proj_add: - /* mapping of parameters to symbols of [2] (Algorithm 4): - X1 = x_p; Y1 = y_p; Z1 = z_p; X2 = x_q; Y2 = y_q; Z2 = z_q - X3 = x_r; Y3 = y_r; Z3 = z_r */ - - /* 1: w14 = t0 <= X1*X2 = w11*w8 */ - bn.mov w24, w11 - bn.mov w25, w8 - jal x1, mod_mul_256x256 - bn.mov w14, w19 - - /* 2: w15 = t1 <= Y1*Y2 = w12*w9 */ - bn.mov w24, w12 - bn.mov w25, w9 - jal x1, mod_mul_256x256 - bn.mov w15, w19 - - /* 3: w16 = t2 <= Z1*Z2 = w13*w10*/ - bn.mov w24, w13 - bn.mov w25, w10 - jal x1, mod_mul_256x256 - bn.mov w16, w19 - - /* 5: w17 = t4 <= X2+Y2 = w11 + w12 */ - bn.addm w17, w11, w12 - - /* 4: w18 = t3 <= X1+Y1 = w8+w9 */ - bn.addm w18, w8, w9 - - /* 6: w19 = t3 <= t3*t4 = w18*w17 */ - bn.mov w24, w17 - bn.mov w25, w18 - jal x1, mod_mul_256x256 - - /* 7: w18 = t4 <= t0+t1 = w14+w15 */ - bn.addm w18, w14, w15 - - /* 8: w17 = t3 <= t3 - t4 = w19 - w18 */ - bn.subm w17, w19, w18 - - /* 10: w18 = X3 <= Y2 + Z2 = w12 + w13 */ - bn.addm w18, w12, w13 - - /* 9: w19 = t4 <= Y1 + Z1 = w9 + w10 */ - bn.addm w19, w9, w10 - - /* 11: w18 = t4 <= t4 * X3 = w19 * w18 */ - bn.mov w24, w18 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - bn.mov w18, w19 - - /* 12: w19 = X3 <= t1 + t2 = w15 + w16 */ - bn.addm w19, w15, w16 - - /* 13: w18 = t4 <= t4 - X3 = w18 + w19 */ - bn.subm w18, w18, w19 - - /* 15: w19 = Y3 <= X2 + Z2 = w11 + w13 */ - bn.addm w19, w11, w13 - - /* 14: w12 = X3 <= X1 + Z1 = w8 + w10 */ - bn.addm w12, w8, w10 - - /* 16: w11 = X3 <= X3 * Y3 = w12 * w19 */ - bn.mov w24, w19 - bn.mov w25, w12 - jal x1, mod_mul_256x256 - bn.mov w11, w19 - - /* 17: w12 = Y3 <= t0 + t2 = w14 + w16 */ - bn.addm w12, w14, w16 - - /* 18: w12 = Y3 <= X3 - Y3 = w11 - w12 
*/ - bn.subm w12, w11, w12 - - /* 19: w19 = Z3 <= b * t2 = w27 * w16 */ - bn.mov w24, w27 - bn.mov w25, w16 - jal x1, mod_mul_256x256 - - /* 20: w11 = X3 <= Y3 -Z3 = w12 - w19 */ - bn.subm w11, w12, w19 - - /* 21: w13 = Z3 <= X3 + X3 = w11 + w11 */ - bn.addm w13, w11, w11 - - /* 22: w11 = X3 <= w11 + w13 = X3 + Z3 */ - bn.addm w11, w11, w13 - - /* 23: w13 = Z3 <= t1 - X3 = w15 - w11 */ - bn.subm w13, w15, w11 - - /* 24: w11 = X3 <= t1 + X3 = w15 + w11 */ - bn.addm w11, w15, w11 - - /* 25: w19 = Y3 <= w27 * w12 = b * Y3 */ - bn.mov w24, w27 - bn.mov w25, w12 - jal x1, mod_mul_256x256 - - /* 26: w15 = t1 <= t2 + t2 = w16 + w16 */ - bn.addm w15, w16, w16 - - /* 27: w16 = t2 <= t1 + t2 = w15 + w16 */ - bn.addm w16, w15, w16 - - /* 28: w12 = Y3 <= Y3 - t2 = w19 - w16 */ - bn.subm w12, w19, w16 - - /* 29: w12 = Y3 <= Y3 - t0 = w12 - w14 */ - bn.subm w12, w12, w14 - - /* 30: w15 = t1 <= Y3 + Y3 = w12 + w12 */ - bn.addm w15, w12, w12 - - /* 31: w12 = Y3 <= t1 + Y3 = w15 + w12*/ - bn.addm w12, w15, w12 - - /* 32: w15 = t1 <= t0 + t0 = w14 + w14 */ - bn.addm w15, w14, w14 - - /* 33: w14 = t0 <= t1 + t0 = w15 + w14 */ - bn.addm w14, w15, w14 - - /* 34: w14 = t0 <= t0 - t2 = w14 - w16 */ - bn.subm w14, w14, w16 - - /* 35: w15 = t1 <= t4 * Y3 = w18 * w12 */ - bn.mov w24, w18 - bn.mov w25, w12 - jal x1, mod_mul_256x256 - bn.mov w15, w19 - - /* 36: w16 = t2 <= t0 * Y3 = w14 * w12 */ - bn.mov w24, w14 - bn.mov w25, w12 - jal x1, mod_mul_256x256 - bn.mov w16, w19 - - /* 37: w12 = Y3 <= X3 * Z3 = w11 * w13 */ - bn.mov w24, w11 - bn.mov w25, w13 - jal x1, mod_mul_256x256 - - /* 38: w12 = Y3 <= Y3 + t2 = w19 + w16 */ - bn.addm w12, w19, w16 - - /* 39: w19 = X3 <= t3 * X3 = w17 * w11 */ - bn.mov w24, w17 - bn.mov w25, w11 - jal x1, mod_mul_256x256 - - /* 40: w11 = X3 <= X3 - t1 = w19 - w15 */ - bn.subm w11, w19, w15 - - /* 41: w13 = Z3 <= t4 * Z3 = w18 * w13 */ - bn.mov w24, w18 - bn.mov w25, w13 - jal x1, mod_mul_256x256 - bn.mov w13, w19 - - /* 42: w19 = t1 <= t3 * t0 = w17 * w14 */ 
- bn.mov w24, w17 - bn.mov w25, w14 - jal x1, mod_mul_256x256 - - /* 43: w13 = Z3 <= Z3 + t1 = w13 + w19 */ - bn.addm w13, w13, w19 - - ret - - -/** - * Convert projective coordinates of a P-256 curve point to affine coordinates - * - * returns P = (x_a, y_a) = (x/z mod p, y/z mod p) - * with P being a valid P-256 curve point - * x_a and y_a being the affine coordinates of said curve point - * x, y and z being a set of projective coordinates of said point - * and p being the modulus of the P-256 underlying finite field. - * - * This routine computes the affine coordinates for a set of projective - * coordinates of a valid P-256 curve point. The routine performs the required - * divisions by computing the multiplicative modular inverse of the - * projective z-coordinate in the underlying finite field of the P-256 curve. - * For inverse computation Fermat's little theorem is used, i.e. - * we compute z^-1 = z^(p-2) mod p. - * For exponentiation a 27 step addition chain is used. - * This addition chain is (presumably?) the shortest addition chain known as - * of today for the exponent exp = p - 2 = 2^256 - 2^224 + 2^6 + 2^64 - 1 - 2. - * Origin of the chain can not fully be traced. [1] attributes it to a specific - * patch to OpenSLL. The same addition chain is used in the OpenSSL and - * BoringSSL crypto libraries. - * This routine runs in constant time. - * - * [1] https://doi.org/10.1007/s13389-014-0090-x - * https://eprint.iacr.org/2013/816.pdf - * - * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on - * the computed affine y-coordinate. - * - * @param[in] w8: x, x-coordinate of curve point (projective) - * @param[in] w9: y, y-coordinate of curve point (projective) - * @param[in] w10: z, z-coordinate of curve point (projective) - * @param[in] w29: p, modulus, 2^256 > p > 2^255. - * @param[in] w28: u, pre-computed Barrett constant (without u[256]/MSb - * of u which is always 1 for the allowed range. 
- * @param[in] MOD: p, modulus of the finite field of P-256 - * @param[out] w11: x_a, x-coordinate of curve point (affine) - * @param[out] w12: y_a, y-coordinate of curve point (affine) - * - * clobbered registers: w10 to w19, w24, w25 - * clobbered flag groups: FG0 - */ -proj_to_affine: - - /* 1: exp = 0x1 */ - bn.addm w10, w10, w31 - - /* 2: exp = 0x2 = 2*0x1 */ - bn.mov w24, w10 - bn.mov w25, w10 - jal x1, mod_mul_256x256 - - /* 3: exp = 0x3 = 0x2+0x1 */ - bn.mov w24, w19 - bn.mov w25, w10 - jal x1, mod_mul_256x256 - bn.mov w12, w19 - - /* 4: exp = 0x6 = 2*0x3 */ - bn.mov w24, w19 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - - /* 5: exp = 0xc = 2*0x6 */ - bn.mov w24, w19 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - - /* 6: exp = 0xf = 0xc+0x3 */ - bn.mov w24, w19 - bn.mov w25, w12 - jal x1, mod_mul_256x256 - bn.mov w13, w19 - - /* 7: exp = 0xf0 = 16*0xf */ - loopi 4, 4 - bn.mov w24, w19 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - nop - - /* 8: exp = 0xff = 0xf0+0xf */ - bn.mov w24, w19 - bn.mov w25, w13 - jal x1, mod_mul_256x256 - bn.mov w14, w19 - - /* 9: exp = 0xff00 = 256*0xff */ - loopi 8, 4 - bn.mov w24, w19 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - nop - - /* 10: exp = 0xffff = 0xff00+0xff */ - bn.mov w24, w19 - bn.mov w25, w14 - jal x1, mod_mul_256x256 - bn.mov w15, w19 - - /* 11: exp = 0xffff0000 = 2^16*0xffff */ - loopi 16, 4 - bn.mov w24, w19 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - nop - - /* 12: exp = 0xffffffff = 0xffff0000+0xffff */ - bn.mov w24, w19 - bn.mov w25, w15 - jal x1, mod_mul_256x256 - bn.mov w16, w19 - - /* 13: exp = 0xffffffff00000000 = 2^32*0xffffffff */ - loopi 32, 4 - bn.mov w24, w19 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - nop - bn.mov w17, w19 - - /* 14: exp = 0xffffffff00000001 = 0xffffffff00000000+0x1 */ - bn.mov w24, w10 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - - /* 15: exp = - 0xffffffff00000001000000000000000000000000000000000000000000000000 - = 2^192*0xffffffff00000001 */ - loopi 192, 4 - bn.mov w24, 
w19 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - nop - bn.mov w18, w19 - - /* 16: exp = 0xffffffffffffffff = 0xffffffff00000000+0xffffffff */ - bn.mov w24, w17 - bn.mov w25, w16 - jal x1, mod_mul_256x256 - - /* 17: exp = 0xffffffffffffffff0000 = 2^16*0xffffffffffffffff */ - loopi 16, 4 - bn.mov w24, w19 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - nop - - /* 18: exp = 0xffffffffffffffffffff = 0xffffffffffffffff0000+0xffff */ - bn.mov w24, w15 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - - /* 19: exp = 0xffffffffffffffffffff00 = 256*0xffffffffffffffffffff */ - loopi 8, 4 - bn.mov w24, w19 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - nop - - /* 20: exp = 0xffffffffffffffffffffff = 0xffffffffffffffffffff00+0xff */ - bn.mov w24, w14 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - - /* 21: exp = 0xffffffffffffffffffffff0 = 16*0xffffffffffffffffffffff */ - loopi 4, 4 - bn.mov w24, w19 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - nop - - /* 22: exp = 0xfffffffffffffffffffffff = 0xffffffffffffffffffffff0+0xf */ - bn.mov w24, w13 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - - /* 23: exp = 0x3ffffffffffffffffffffffc = 4*0xfffffffffffffffffffffff */ - loopi 2, 4 - bn.mov w24, w19 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - nop - - /* 24: exp = 0x3fffffffffffffffffffffff = 0x3ffffffffffffffffffffffc+0x3 */ - bn.mov w24, w12 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - - /* 25: exp = 0xfffffffffffffffffffffffc = 4*0x3fffffffffffffffffffffff */ - loopi 2, 4 - bn.mov w24, w19 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - nop - - /* 26: exp = 0xfffffffffffffffffffffffd = 0xfffffffffffffffffffffffc+0x1 */ - bn.mov w24, w10 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - - /* 27: exp = p-2 - = 0xffffffff00000001000000000000000000000000fffffffffffffffffffffffd - = 0xfffffffffffffffffffffffd - + 0xffffffff00000001000000000000000000000000000000000000000000000000 - w14 = z^exp = z^(p-2) = z^-1 mod p */ - bn.mov w24, w19 - bn.mov w25, w18 - jal x1, mod_mul_256x256 - bn.mov w14, w19 
- - /* convert x-coordinate to affine - w11 = x_a = x/z = x * z^(-1) = w8 * w14 */ - bn.mov w24, w8 - bn.mov w25, w14 - jal x1, mod_mul_256x256 - bn.mov w11, w19 - - /* convert y-coordinate to affine - w12 = y_a = y/z = y * z^(-1) = w9 * w14 */ - bn.mov w24, w9 - bn.mov w25, w14 - jal x1, mod_mul_256x256 - bn.mov w12, w19 - - ret - - -/** - * Variable time modular multiplicative inverse computation - * - * returns x_inv = x^-1 mod m - * - * This routine computes the modular multiplicative inverse for any x < m in - * the field GF(m). - * For inverse computation Fermat's little theorem is used, i.e. - * we compute x^-1 = x^(m-2) mod m. - * For exponentiation we use a standard, variable time (!) square and multiply - * algorithm. - * - * @param[in] w0: x, a 256 bit operand with x < m - * @param[in] w29: m, modulus, 2^256 > m > 2^255. - * @param[in] w28: u, lower 256 bit of pre-computed Barrett constant - * @param[in] w31, all-zero - * @param[in] MOD: m, modulus - * @param[out] w1: x_inv, modular multiplicative inverse - * - * Flags: Flags have no meaning beyond the scope of this subroutine. 
- * - * clobbered registers: w1, w2, w3, w19, w24, w25 - * clobbered flag groups: FG0 - */ -mod_inv: - - /* subtract 2 from modulus for Fermat's little theorem - w2 = MOD - 2 = m - 2 */ - bn.wsrr w2, 0 - bn.subi w2, w2, 2 - - /* init square and multiply: w1 = 1 */ - bn.addi w1, w31, 1 - - /* square and multiply loop */ - loopi 256, 14 - - /* square: w3 = w19 = w24*w25 = w1^2 mod m */ - bn.mov w24, w1 - bn.mov w25, w1 - jal x1, mod_mul_256x256 - bn.mov w3, w19 - - /* shift MSB into carry flag - w2 = 2*w2 = w2 << 1 */ - bn.add w2, w2, w2 - - /* skip multiplication if C flag not set */ - bn.sel w1, w1, w3, C - csrrs x2, 0x7c0, x0 - andi x2, x2, 1 - beq x2, x0, nomul - - /* multiply: w1 = w19 = w24*w25 = w3*w0 mod m */ - bn.mov w24, w3 - bn.mov w25, w0 - jal x1, mod_mul_256x256 - bn.mov w1, w19 - - nomul: - nop - - ret - - -/** - * Fetch curve point from dmem and randomize z-coordinate - * - * returns P = (x, y, z) = (x_a*z, y_a*z, z) - * with P being a valid P-256 curve point in projective coordinates - * x_a and y_a being the affine coordinates as fetched from dmem - * z being a randomized z-coordinate - * - * This routines fetches the affine x- and y-coordinates of a curve point from - * dmem and computes a valid set of projective coordinates. The z-coordinate is - * randomized and x and y are scaled appropriately. - * This routine runs in constant time. 
- * - * @param[in] x10: constant 24 - * @param[in] x21: dptr_x, pointer to dmem location containing affine - * x-coordinate of input point - * @param[in] x22: dptr_y, pointer to dmem location containing affine - * y-coordinate of input point - * @param[in] w28: u, lower 256 bit of Barrett constant for curve P-256 - * @param[in] w29: p, modulus of P-256 underlying finite field - * @param[in] w31: all-zero - * @param[in] MOD: p, modulus of P-256 underlying finite field - * @param[out] w26: z, random projective z-coordinate - * @param[out] w6: x, projective x-coordinate - * @param[out] w7: y, projective y-coordinate - * - * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on - * the scaled projective y-coordinate. - * - * clobbered registers: w2, w6, w7, w19 to w26 - * clobbered flag groups: FG0 - */ -fetch_proj_randomize: - - /* get random number */ - bn.wsrr w2, 1 - - /* reduce random number - w26 = z <= w2 mod p */ - bn.addm w26, w2, w31 - - /* fetch x-coordinate from dmem - w24 = x_a <= dmem[x22] = dmem[dptr_x] */ - bn.lid x10, 0(x21) - - /* scale x-coordinate - w6 = x <= w24*w26 = x_a*z mod p */ - bn.mov w25, w26 - jal x1, mod_mul_256x256 - bn.mov w6, w19 - - /* fetch y-coordinate from dmem - w24 = y_a <= dmem[x22] = dmem[dptr_y] */ - bn.lid x10, 0(x22) - - /* scale y-coordinate - w7 = y <= w24*w26 = y_a*z mod p */ - bn.mov w25, w26 - jal x1, mod_mul_256x256 - bn.mov w7, w19 - - ret - - -/** - * P-256 point doubling in projective space - * - * returns R = (x_r, y_r, z_r) <= 2*P = 2*(x_p, y_p, z_p) - * with R, P being valid P-256 curve points - * - * This routines doubles a given P-256 curve point in projective coordinate. - * Internally this routine makes use of the point addition routine and - * adds the point to itself. - * This routine runs in constant time. 
- * - * @param[in] w8: x_p, x-coordinate of input point - * @param[in] w9: y_p, y-coordinate of input point - * @param[in] w10: z_p, z-coordinate of input point - * @param[in] w27: b, curve domain parameter - * @param[in] w29: p, p, modulus of P-256 underlying finite field - * @param[in] w28: u, u, lower 256 bit of Barrett constant for curve P-256 - * @param[in] w31: all-zero. - * @param[in] MOD: p, modulus of P-256 underlying finite field - * @param[out] w11: x_r, x-coordinate of resulting point - * @param[out] w12: y_r, y-coordinate of resulting point - * @param[out] w13: z_r, z-coordinate of resulting point - * - * Flags: Flags have no meaning beyond the scope of this subroutine. - * - * clobbered registers: w11 to w25 - * clobbered flag groups: FG0 - */ -proj_double: - - /* Q = (x_q, y_q, z_q) = (w11, w12, w13) <= P = (x_p, y_p, z_p) - = (w8, w9, w10) */ - bn.mov w11, w8 - bn.mov w12, w9 - bn.mov w13, w10 - - /* R = (x_r, y_r, z_r) = (w11, w12, w13) = P+Q - = (w8, w9, w10) + (w11, w12, w13) = (x_p, y_p, z_p) + (x_q, y_q, z_q) */ - jal x1, proj_add - - ret - - -/** - * P-256 scalar point multiplication in affine space - * - * returns R = k*P = k*(x_p, y_p) - * with R, P being valid P-256 curve points in affine coordinates - * k being a 256 bit scalar - * - * This routine performs scalar multiplication based on the group laws - * of Weierstrass curves. - * A constant time double-and-add algorithm (sometimes referred to as - * double-and-add-always) is used. - * Due to the P-256 optimized implementations of the called routines for - * point addition and doubling, this routine is limited to P-256. - * The routine makes use of blinding by additive splitting the - * exponent/scalar k into a random number (rnd) and rnd-k. The double-and-add - * loop operates on both shares in parallel applying Shamir's trick. 
- * - * @param[in] x21: dptr_x, pointer to affine x-coordinate in dmem - * @param[in] x22: dptr_y, pointer to affine y-coordinate in dmem - * @param[in] w0: k, scalar for multiplication - * @param[in] w1: rnd, blinding parameter - * @param[in] w27: b, curve domain parameter - * @param[in] w31: all-zero - * @param[in] MOD: p, modulus, 2^256 > p > 2^255. - * @param[out] w11: x_r, affine x-coordinate of resulting point - * @param[out] w12: y_r, affine y-coordinate of resulting point - * - * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on - * the computed affine y-coordinate. - * - * clobbered registers: x2, x3, x10, w0 to w26 - * clobbered flag groups: FG0 - */ -scalar_mult_int: - - /* load order of base point G of P-256 - w29 <= n = dmem[p256_n] */ - li x2, 29 - la x3, p256_n - bn.lid x2, 0(x3) - - /* store n to MOD WSR */ - bn.wsrw 0, w29 - - /* 1st share (reduced rnd) - rnd = w1 <= rnd mod n */ - bn.addm w1, w1, w31 - - /* 2nd share (k-rnd) - w0 = w0 - w1 = k - rnd mod n */ - bn.subm w0, w0, w1 - - /* load field modulus p from dmem - w29 <= p = dmem[p256_p] */ - li x2, 29 - la x3, p256_p - bn.lid x2, 0(x3) - - /* store modulus to MOD WSR */ - bn.wsrw 0, w29 - - /* load lower 256 bit of Barrett constant u for modulus p from dmem - w28 <= u = dmem[p256_u_p] */ - li x2, 28 - la x3, p256_u_p - bn.lid x2, 0(x3) - - /* load domain parameter b from dmem - w27 <= b = dmem[p256_b] */ - li x2, 27 - la x3, p256_b - bn.lid x2, 0(x3) - - /* get randomized projective coodinates of curve point - P = (x_p, y_p, z_p) = (w8, w9, w10) = (w6, w7, w26) = - (x*z mod p, y*z mod p, z) */ - li x10, 24 - jal x1, fetch_proj_randomize - bn.mov w8, w6 - bn.mov w9, w7 - bn.mov w10, w26 - - /* Init 2P, this will be used for the addition part in the double-and-add - loop when the bit at the current index is 1 for both shares of the scalar. 
- 2P = (w3, w4, w5) <= (w11, w12, w13) <= 2*(w8, w9, w10) = 2*P */ - jal x1, proj_double - bn.mov w3, w11 - bn.mov w4, w12 - bn.mov w5, w13 - - /* init double-and-add with point in infinity - Q = (w8, w9, w10) <= (0, 1, 0) */ - bn.mov w8, w31 - bn.addi w9, w31, 1 - bn.mov w10, w31 - - /* double-and-add loop with decreasing index */ - loopi 256, 32 - - /* double point Q - Q = (w11, w12, w13) <= 2*(w8, w9, w10) = 2*Q */ - jal x1, proj_double - - /* re-fetch and randomize P again - P = (w6, w7, w26) */ - jal x1, fetch_proj_randomize - - /* probe if MSb of either of the two scalars (rnd or k-rnd) but not both - is 1. - If only one MSb is set, select P for addition - If both MSbs are set, select 2P for addition - (If neither MSB is set, also 2P will be selected but this will be - discarded late) */ - bn.xor w8, w0, w1 - - /* P = (w8, w9, w10) - <= (w0[255] xor w1[256])?P=(w6, w7, w26):2P=(w3, w4, w5) */ - bn.sel w8, w6, w3, M - bn.sel w9, w7, w4, M - bn.sel w10, w26, w5, M - - /* save doubling result to survive follow-up subroutine call - Q = (w2, w6, w7) <= (w11, w12, w13) */ - bn.mov w2, w11 - bn.mov w6, w12 - bn.mov w7, w13 - - /* add points - Q+P = (w11, w12, w13) <= (w11, w12, w13) + (w8, w9, w10) */ - jal x1, proj_add - - /* probe if MSb of either one or both of the two - scalars (rnd or k-rnd) is 1.*/ - bn.or w8, w0, w1 - - /* select doubling result (Q) or addition result (Q+P) - Q = w0[255] or w1[255]?Q_a=(w11, w12, w13):Q=(w2, w6, w7) */ - bn.sel w8, w11, w2, M - bn.sel w9, w12, w6, M - bn.sel w10, w13, w7, M - - /* rotate both scalars left 1 bit */ - bn.rshi w0, w0, w0 >> 255 - bn.rshi w1, w1, w1 >> 255 - - /* init regs with random numbers */ - bn.wsrr w11, 1 - bn.wsrr w12, 1 - bn.wsrr w13, 1 - - /* get a fresh random number and scale the coordinates of - 2P = (w3, w4, w5) (scaling each projective coordinate with same - factor results in same point) */ - bn.wsrr w2, 1 - - /* w3 = w3 * w2 */ - bn.mov w24, w3 - bn.mov w25, w2 - jal x1, mod_mul_256x256 - bn.mov 
w3, w19 - - /* w4 = w4 * w2 */ - bn.mov w24, w4 - bn.mov w25, w2 - jal x1, mod_mul_256x256 - bn.mov w4, w19 - - /* w5 = w5 * w2 */ - bn.mov w24, w5 - bn.mov w25, w2 - jal x1, mod_mul_256x256 - bn.mov w5, w19 - - /* convert back to affine coordinates - R = (x_a, y_a) = (w11, w12) */ - jal x1, proj_to_affine - - ret - - -/** - * P-256 ECDSA signature generation - * - * returns the signature as the pair r, s with - * r = x_1 mod n - * and s = k^(-1)(msg + r*d) mod n - * with x_1 being the affine x-coordinate of the curve point k*G, - * where G is the curve's base point. - * k being a supplied secret random number, - * n being the order of the base point G of P-256, - * msg being the msg to be signed, - * d being the private key. - * - * This routine runs in constant time. - * - * Note: Some versions of the ECDSA spec suggest that msg must be reduced - * modulo n (e.g. RFC 6979, section 2.4). However, for this implementation, it - * is sufficient that msg < 2^256, because the message is multiplied with - * k^(-1) mod n, and our Barrett multiplication implementation accepts any - * operands a and b such that a * b < 2^256 * p and fully reduces the result. - * - * @param[in] dmem[0]: dptr_k, pointer to a 256 bit random secret in dmem - * @param[in] dmem[4]: dptr_rnd, pointer to location in dmem containing random - * number for blinding - * @param[in] dmem[8]: dptr_msg, pointer to the message to be signed in dmem - * @param[in] dmem[12]: dptr_r, pointer to dmem location where s component - * of signature will be placed - * @param[in] dmem[16]: dptr_s, pointer to dmem location where r component - * of signature will be placed - * @param[in] dmem[28]: dptr_d, pointer to private key d in dmem - * - * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on - * the computed affine y-coordinate. 
- * - * clobbered registers: x2, x3, x16 to x23, w0 to w26 - * clobbered flag groups: FG0 - */ -p256_sign: - - /* init all-zero register */ - bn.xor w31, w31, w31 - - /* load dmem pointer to secret random scalar k: x16 <= dptr_k = dmem[0] */ - la x16, dptr_k - lw x16, 0(x16) - - /* load dmem pointer to random number for blinding rnd in dmem: - x17 <= dptr_rnd = dmem[4] */ - la x17, dptr_rnd - lw x17, 0(x17) - - /* load dmem pointer to message msg in dmem: x18 <= dptr_msg = dmem[8] */ - la x18, dptr_msg - lw x18, 0(x18) - - /* load dmem pointer to signature r in dmem: x19 <= dptr_r = dmem[12] */ - la x19, dptr_r - lw x19, 0(x19) - - /* load dmem pointer to signature s in dmem: x20 <= dptr_s = dmem[16] */ - la x20, dptr_s - lw x20, 0(x20) - - /* load dmem pointer to private key d in dmem: x23 <= d = dmem[28] */ - la x23, dptr_d - lw x23, 0(x23) - - /* load secret random scalar k from dmem: w0 = dmem[dptr_k] */ - li x2, 0 - bn.lid x2, 0(x16) - - /* load random number for blinding from dmem: w1 = dmem[dptr_rnd] */ - li x2, 1 - bn.lid x2, 0(x17) - - /* scalar multiplication with base point - (x_1, y_1) = (w11, w12) <= k*G = w0*(dmem[p256_gx], dmem[p256_gy]) */ - la x21, p256_gx - la x22, p256_gy - jal x1, scalar_mult_int - - /* setup modulus n (curve order) and Barrett constant - MOD <= w29 <= n = dmem[p256_n]; w28 <= u_n = dmem[p256_u_n] */ - li x2, 29 - la x3, p256_n - bn.lid x2, 0(x3) - bn.wsrw 0, w29 - li x2, 28 - la x3, p256_u_n - bn.lid x2, 0(x3) - - /* re-load secret random number k from dmem: w0 <= k = dmem[dptr_k] */ - li x2, 0 - bn.lid x2, 0(x16) - - /* modular multiplicative inverse of k - w1 <= k^-1 mod n */ - jal x1, mod_inv - - /* w19 = k^-1*d mod n; w24 = d = dmem[dptr_d] */ - li x2, 24 - bn.lid x2, 0(x23) - bn.mov w25, w1 - jal x1, mod_mul_256x256 - - /* w24 = r <= w11 mod n */ - bn.addm w24, w11, w31 - - /* store r of signature in dmem: dmem[dptr_r] <= r = w24 */ - li x2, 24 - bn.sid x2, 0(x19) - - /* w0 = w19 <= w24*w25 = w24*w19 = r*k^-1*d mod n */ - 
bn.mov w25, w19 - jal x1, mod_mul_256x256 - bn.mov w0, w19 - - /* load message from dmem: w24 = msg <= dmem[dptr_msg] = dmem[x18] */ - li x2, 24 - bn.lid x2, 0(x18) - - /* w19 = k^-1*msg <= w25*w24 = w1*w24 mod n */ - bn.mov w25, w1 - jal x1, mod_mul_256x256 - - /* w0 = s <= w19 + w0 = k^-1*msg + r*k^-1*d mod n */ - bn.addm w0, w19, w0 - - /* store s of signature in dmem: dmem[dptr_s] <= s = w0 */ - li x2, 0 - bn.sid x2, 0(x20) - - ret - - -/** - * P-256 scalar multiplication with base point G - * - * returns R = d*G = d*(x_g, y_g) - * with R and G being valid P-256 curve points in affine coordinates, - * furthermore G being the curves base point, - * d being a 256 bit scalar - * - * Performs a scalar multiplication of a scalar with the base point G of curve - * P-256. - * This routine runs in constant time. - * - * @param[in] dmem[4]: dptr_rnd, pointer to location in dmem containing random - * number for blinding - * @param[in] dmem[20]: dptr_x, pointer to affine x-coordinate in dmem - * @param[in] dmem[22]: dptr_y, pointer to affine y-coordinate in dmem - * @param[in] dmem[28]: dptr_d, pointer to location in dmem containing - * scalar d - * - * Flags: Flags have no meaning beyond the scope of this subroutine. 
- * - * clobbered registers: x2, x3, x16, x17, x21, x22, w0 to w26 - * clobbered flag groups: FG0 - */ -p256_base_mult: - - /* init all-zero register */ - bn.xor w31, w31, w31 - - /* load scalar d: w0 <= d = dmem[dptr_d] */ - la x16, dptr_d - lw x16, 0(x16) - - /* load dmem pointer to random number for blinding rnd in dmem: - x17 <= dptr_rnd = dmem[4] */ - la x17, dptr_rnd - lw x17, 0(x17) - - /* set dmem pointers to base point coordinates */ - la x21, p256_gx - la x22, p256_gy - - /* load private key d from dmem: w0 = dmem[dptr_d] */ - li x2, 0 - bn.lid x2, 0(x16) - - /* load random number for blinding from dmem: w1 = dmem[dptr_rnd] */ - li x2, 1 - bn.lid x2, 0(x17) - - /* call internal scalar multiplication routine - R = (x_a, y_a) = (w11, w12) <= k*P = w0*P */ - jal x1, scalar_mult_int - - /* set dmem pointer to point x-coordinate */ - la x21, dptr_x - lw x21, 0(x21) - - /* set dmem pointer to point y-coordinate */ - la x22, dptr_y - lw x22, 0(x22) - - /* store result (affine coordinates) in dmem - dmem[x21] = dmem[dptr_x] <= x_a = w11 - dmem[x22] = dmem[dptr_y] <= y_a = w12 */ - li x2, 11 - bn.sid x2++, 0(x21) - bn.sid x2, 0(x22) - - ret - - -/** - * Variable time modular multiplicative inverse computation - * - * Returns c <= a^(-1) mod m - * with a being a bigint of length 256 bit with a < m - * m being the modulus with a length of 256 bit - * c being a 256-bit result - * - * This routine implements the computation of the modular multiplicative - * inverse based on the binary GCD or Stein's algorithm. - * The implemented variant is based on the - * "right-shift binary extended GCD" as it is described in section 3.1 of [1] - * (Algorithm 1). - * [1] https://doi.org/10.1155/ES/2006/32192 - * - * Note that this is a variable time implementation. I.e. this routine will - * show a data dependent timing and execution profile. Only use in situations - * where a full white-box environment is acceptable. 
- * - * Flags: Flags have no meaning beyond the scope of this subroutine. - * - * @param[in] w0: a, operand - * @param[in] MOD: m, modulus - * @param[in] w31: all-zero - * @param[out] w1: result c - * - * clobbered registers: x2, w2, w3, w4, w7 - * clobbered flag groups: FG0 - */ -mod_inv_var: - - /* w2 = r = 0 */ - bn.mov w2, w31 - - /* w3 = s = 1 */ - bn.addi w3, w31, 1 - - /* w4 = u = MOD */ - bn.wsrr w4, 0 - bn.wsrr w7, 0 - - /* w5 = v = w0 */ - bn.mov w5, w0 - - ebgcd_loop: - /* test if u is odd */ - bn.or w4, w4, w4 - csrrs x2, 0x7c0, x0 - andi x2, x2, 4 - bne x2, x0, ebgcd_u_odd - - /* u is even: */ - /* w4 = u <= u/2 = w4 >> 1 */ - bn.rshi w4, w31, w4 >> 1 - - /* test if r is odd */ - bn.or w2, w2, w2 - csrrs x2, 0x7c0, x0 - andi x2, x2, 4 - bne x2, x0, ebgcd_r_odd - - /* r is even: */ - /* w2 = r <= r/2 = w2 >> 1 */ - bn.rshi w2, w31, w2 >> 1 - jal x0, ebgcd_loop - - ebgcd_r_odd: - /* w2 = r <= (r + m)/2 = (w2 + w7) >> 1 */ - bn.add w2, w7, w2 - bn.addc w6, w31, w31 - bn.rshi w2, w6, w2 >> 1 - jal x0, ebgcd_loop - - ebgcd_u_odd: - /* test if v is odd */ - bn.or w5, w5, w5 - csrrs x2, 0x7c0, x0 - andi x2, x2, 4 - bne x2, x0, ebgcd_uv_odd - - /* v is even: */ - /* w5 = v <= v/2 = w5 >> 1 */ - bn.rshi w5, w31, w5 >> 1 - - /* test if s is odd */ - bn.or w3, w3, w3 - csrrs x2, 0x7c0, x0 - andi x2, x2, 4 - bne x2, x0, ebgcd_s_odd - - /* s is even: */ - /* w3 = s <= s/2 = w3 >> 1 */ - bn.rshi w3, w31, w3 >> 1 - jal x0, ebgcd_loop - - ebgcd_s_odd: - /* w3 = s <= (s + m)/2 = (w3 + w7) >> 1 */ - bn.add w3, w7, w3 - bn.addc w6, w31, w31 - bn.rshi w3, w6, w3 >> 1 - jal x0, ebgcd_loop - - ebgcd_uv_odd: - /* test if v >= u */ - bn.cmp w5, w4 - csrrs x2, 0x7c0, x0 - andi x2, x2, 1 - beq x2, x0, ebgcd_v_gte_u - - /* u > v: */ - /* w2 = r <= r - s = w2 - w3; if (r < 0): r <= r + m */ - bn.subm w2, w2, w3 - - /* w4 = u <= u - v = w4 - w5 */ - bn.sub w4, w4, w5 - jal x0, ebgcd_loop - - ebgcd_v_gte_u: - /* w3 = s <= s - r = w3 - w2; if (s < 0) s <= s + m */ - bn.subm w3, w3, 
w2 - - /* w5 = v <= v - u = w5 - w4 */ - bn.sub w5, w5, w4 - - /* if v > 0 go back to start of loop */ - csrrs x2, 0x7c0, x0 - andi x2, x2, 8 - beq x2, x0, ebgcd_loop - - /* v <= 0: */ - /* if (r > m): w1 = a = r - m = w2 - MOD else: w1 = a = r = w2 */ - bn.addm w1, w2, w31 - - ret - - -/** - * P-256 ECDSA signature verification - * - * returns the affine x-coordinate of - * (x1, y1) = u1*G + u2*Q - * with u1 = z*s^-1 mod n and u2 = r*s^-1 mod n - * with G being the curve's base point, - * z being the message - * r, s being the signature - * Q being the public key. - * - * The routine computes the x1 coordinate and places it in dmem. x1 will be - * reduced (mod n), however, the final comparison has to be performed on the - * host side. The signature is valid if x1 == r. - * This routine runs in variable time. - * - * @param[in] dmem[8]: dptr_msg, pointer to the message to be verified in dmem - * @param[in] dmem[12]: dptr_r, pointer to s of signature in dmem - * @param[in] dmem[16]: dptr_s, pointer to r of signature in dmem - * @param[in] dmem[20]: dptr_x, pointer to x-coordinate of public key in dmem - * @param[in] dmem[20]: dptr_y, pointer to y-coordinate of public key in dmem - * @param[in] dmem[32]: dptr_x_r, pointer to dmem location where the reduced - * affine x_r-coordinate will be stored (aka x_1) - * - * Flags: Flags have no meaning beyond the scope of this subroutine. 
- * - * clobbered registers: x2, x3, x13, x14, x17 to x24, w0 to w25 - * clobbered flag groups: FG0 - */ -p256_verify: - - /* init all-zero register */ - bn.xor w31, w31, w31 - - /* load domain parameter b from dmem - w27 <= b = dmem[p256_b] */ - li x2, 27 - la x3, p256_b - bn.lid x2, 0(x3) - - /* load dmem pointer to x_r (result) from dmem: x17 <= dptr_x_r = dmem[32] */ - la x17, dptr_x_r - lw x17, 0(x17) - - /* load dmem pointer to message msg in dmem: x18 <= dptr_msg = dmem[8] */ - la x18, dptr_msg - lw x18, 0(x18) - - /* load dmem pointer to signature r in dmem: x19 <= dptr_r = dmem[12] */ - la x19, dptr_r - lw x19, 0(x19) - - /* load dmem pointer to signature s in dmem: x20 <= dptr_s = dmem[16] */ - la x20, dptr_s - lw x20, 0(x20) - - /* load dmem pointer to affine x-coordinate of public key from dmem: - x21 <= dptr_x = dmem[20] */ - la x21, dptr_x - lw x21, 0(x21) - - /* load dmem pointer to affine y-coordinate of public key from dmem: - x22 <= dptr_y = dmem[24] */ - la x22, dptr_y - lw x22, 0(x22) - - la x23, p256_gx - la x24, p256_gy - - /* load r of signature from dmem: w24 = r = dmem[dptr_r] */ - li x2, 11 - bn.lid x2, 0(x19) - - /* setup modulus n (curve order) and Barrett constant - MOD <= w29 <= n = dmem[p256_n]; w28 <= u_n = dmem[p256_u_n] */ - li x2, 29 - la x3, p256_n - bn.lid x2, 0(x3) - bn.wsrw 0, w29 - li x2, 28 - la x3, p256_u_n - bn.lid x2, 0(x3) - - /* load s of signature from dmem: w0 = s = dmem[dptr_s] */ - bn.lid x0, 0(x20) - - /* goto 'fail' if w0 == w31 <=> s == 0 */ - bn.cmp w0, w31 - csrrs x2, 0x7c0, x0 - andi x2, x2, 8 - bne x2, x0, fail - - /* goto 'fail' if w0 >= w29 <=> s >= n */ - bn.cmp w0, w29 - csrrs x2, 0x7c0, x0 - andi x2, x2, 1 - beq x2, x0, fail - - /* w1 = s^-1 mod n */ - jal x1, mod_inv_var - - /* load r of signature from dmem: w24 = r = dmem[dptr_r] */ - li x2, 24 - bn.lid x2, 0(x19) - - /* goto 'fail' if w24 == w31 <=> r == 0 */ - bn.cmp w24, w31 - csrrs x2, 0x7c0, x0 - andi x2, x2, 8 - bne x2, x0, fail - - /* goto 
'fail' if w0 >= w29 <=> r >= n */ - bn.cmp w24, w29 - csrrs x2, 0x7c0, x0 - andi x2, x2, 1 - beq x2, x0, fail - - /* w25 = s^-1 = w1 */ - bn.mov w25, w1 - - /* u2 = w0 = w19 <= w24*w25 = r*s^-1 mod n */ - jal x1, mod_mul_256x256 - bn.mov w0, w19 - - /* load message, w24 = msg = dmem[dptr_msg] */ - li x2, 24 - bn.lid x2, 0(x18) - - /* u1 = w1 = w19 <= w24*w25 = w24*w1 = msg*s^-1 mod n */ - bn.mov w25, w1 - jal x1, mod_mul_256x256 - bn.mov w1, w19 - - /* setup modulus p and Barrett constant */ - li x2, 29 - la x3, p256_p - bn.lid x2, 0(x3) - bn.wsrw 0, w29 - li x2, 28 - la x3, p256_u_p - bn.lid x2, 0(x3) - - /* load public key Q from dmem and use in projective form (set z to 1) - Q = (w11, w12, w13) = (dmem[dptr_x], dmem[dptr_y], 1) */ - li x2, 11 - bn.lid x2++, 0(x21) - bn.lid x2, 0(x22) - bn.addi w13, w31, 1 - - /* load base point G and use in projective form (set z to 1) - G = (w8, w9, w10) = (x_g, y_g, 1) */ - li x13, 8 - li x14, 9 - bn.lid x13, 0(x23) - bn.lid x14, 0(x24) - bn.addi w10, w31, 1 - - /* The rest of the routine implements a variable time double-and-add - algorithm. For the signature verification we need to compute the point - C = (x1, y1) = u_1*G + u_2*Q. This can be done in a single - double-and-add routine by using Shamir's Trick. 
*/ - - /* G+Q = (w3,w4,w5) = (w11,w12,w13) = (w8,w9,w10) (+) (w11,w12,w13) */ - jal x1, proj_add - bn.mov w3, w11 - bn.mov w4, w12 - bn.mov w5, w13 - - /* w2 = u_2 & u_0 = w0 & w1*/ - bn.and w2, w0, w1 - - /* init double and add algorithm with (0, 1, 0) */ - bn.mov w11, w31 - bn.addi w12, w31, 1 - bn.mov w13, w31 - - /* main loop with dicreasing index i (i=255 downto 0) */ - loopi 256, 31 - - /* always double: C = (w11,w12,w13) <= 2 (*) C = 2 (*) (w11,w12,w13) */ - bn.mov w8, w11 - bn.mov w9, w12 - bn.mov w10, w13 - jal x1, proj_add - - /* if either u_1[i] == 0 or u_2[i] == 0 jump to 'no_both' */ - bn.add w2, w2, w2 - csrrs x2, 0x7c0, x0 - andi x2, x2, 1 - beq x2, x0, no_both - - /* both bits at current index (u1[i] and u2[i]) are set: - do C <= C + (P + Q) and jump to end */ - bn.mov w8, w3 - bn.mov w9, w4 - bn.mov w10, w5 - jal x1, proj_add - jal x0, no_q - - /* either u1[i] or u2[i] is set, but not both */ - no_both: - - /* if u2[i] is not set jump to 'no_g' */ - bn.add w6, w0, w0 - csrrs x2, 0x7c0, x0 - andi x2, x2, 1 - beq x2, x0, no_g - - /* u2[i] is set: do C <= C + Q */ - bn.lid x13, 0(x21) - bn.lid x14, 0(x22) - bn.addi w10, w31, 1 - jal x1, proj_add - - no_g: - /* if u1[i] is not set jump to 'no_q' */ - bn.add w6, w1, w1 - csrrs x2, 0x7c0, x0 - andi x2, x2, 1 - beq x2, x0, no_q - - /* load base point x-coordinate - w8 <= g_x = dmem [p256_gx]; w9 <= g_y = dmem[p256_gy] */ - bn.lid x13, 0(x23) - bn.lid x14, 0(x24) - - /* u1[i] is set: do C <= C + G */ - bn.addi w10, w31, 1 - jal x1, proj_add - - no_q: - /* left shift w0 and w1 to decrease index */ - bn.add w0, w0, w0 - bn.add w1, w1, w1 - - /* compute inverse of z-coordinate: w1 = z_c^-1 mod p */ - bn.mov w0, w13 - jal x1, mod_inv_var - - /* convert x-coordinate of C back to affine: x1 = x_c * z_c^-1 mod p */ - bn.mov w24, w1 - bn.mov w25, w11 - jal x1, mod_mul_256x256 - - /* final reduction: w24 = x1 <= x1 mod n */ - la x3, p256_n - bn.lid x0, 0(x3) - bn.wsrw 0, w0 - bn.subm w24, w19, w31 - - fail: - /* 
store affine x-coordinate in dmem: dmem[dptr_x_r] = w24 = x_r */ - li x2, 24 - bn.sid x2, 0(x17) - - ret - - -/** - * Externally callable wrapper for P-256 scalar point multiplication - * - * returns R = k*P = k*(x_p, y_p, z_p) - * with R, P being valid P-256 curve points in projective form, - * k being a 256 bit scalar. - * - * Sets up context and calls internal scalar multiplication routine. - * This routine runs in constant time. - * - * @param[in] dmem[0]: dK, pointer to location in dmem containing scalar k - * @param[in] dmem[4]: dRnd, pointer to location in dmem containing random - * number for blinding - * @param[in] dmem[20]: dptr_x, pointer to affine x-coordinate in dmem - * @param[in] dmem[22]: dptr_y, pointer to affine y-coordinate in dmem - * - * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on - * the computed affine y-coordinate. - * - * clobbered registers: x2, x3, x16, x17, x21, x22, w0 to w25 - * clobbered flag groups: FG0 - */ -p256_scalar_mult: - - /* init all-zero register */ - bn.xor w31, w31, w31 - - /* load dmem pointer to scalar k: x16 <= dptr_k = dmem[0] */ - la x16, dptr_k - lw x16, 0(x16) - - /* load dmem pointer to random number for blinding rnd in dmem: - x17 <= dptr_rnd = dmem[4] */ - la x17, dptr_rnd - lw x17, 0(x17) - - /* set dmem pointer to point x-coordinate */ - la x21, dptr_x - lw x21, 0(x21) - - /* set dmem pointer to point y-coordinate */ - la x22, dptr_y - lw x22, 0(x22) - - /* load private key d from dmem: w0 = dmem[dptr_d] */ - li x2, 0 - bn.lid x2, 0(x16) - - /* load random number for blinding from dmem: w1 = dmem[dptr_rnd] */ - li x2, 1 - bn.lid x2, 0(x17) - - /* call internal scalar multiplication routine - R = (x_a, y_a) = (w11, w12) <= k*P = w0*P */ - jal x1, scalar_mult_int - - /* store result (affine coordinates) in dmem - dmem[x21] = dmem[dptr_x] <= x_a = w11 - dmem[x22] = dmem[dptr_y] <= y_a = w12 */ - li x2, 11 - bn.sid x2++, 0(x21) - bn.sid x2, 0(x22) - - ret - -.section .data - -/* 
pointer to k (dptr_k) */ -.globl dptr_k -.balign 4 -dptr_k: - .zero 4 - -/* pointer to rnd (dptr_rnd) */ -.globl dptr_rnd -.balign 4 -dptr_rnd: - .zero 4 - -/* pointer to msg (dptr_msg) */ -.globl dptr_msg -.balign 4 -dptr_msg: - .zero 4 - -/* pointer to R (dptr_r) */ -.globl dptr_r -.balign 4 -dptr_r: - .zero 4 - -/* pointer to S (dptr_s) */ -.globl dptr_s -.balign 4 -dptr_s: - .zero 4 - -/* pointer to X (dptr_x) */ -.globl dptr_x -.balign 4 -dptr_x: - .zero 4 - -/* pointer to Y (dptr_y) */ -.globl dptr_y -.balign 4 -dptr_y: - .zero 4 - -/* pointer to D (dptr_d) */ -.globl dptr_d -.balign 4 -dptr_d: - .zero 4 - -/* pointer to verification result x_r aka x_1 (dptr_x_r) */ -.globl dptr_x_r -.balign 4 -dptr_x_r: - .zero 4 - -/* P-256 domain parameter b */ -.globl p256_b -.balign 32 -p256_b: - .word 0x27d2604b - .word 0x3bce3c3e - .word 0xcc53b0f6 - .word 0x651d06b0 - .word 0x769886bc - .word 0xb3ebbd55 - .word 0xaa3a93e7 - .word 0x5ac635d8 - -/* P-256 domain parameter p (modulus) */ -.globl p256_p -.balign 32 -p256_p: - .word 0xffffffff - .word 0xffffffff - .word 0xffffffff - .word 0x00000000 - .word 0x00000000 - .word 0x00000000 - .word 0x00000001 - .word 0xffffffff - -/* Barrett constant u for modulus p */ -.globl p256_u_p -.balign 32 -p256_u_p: - .word 0x00000003 - .word 0x00000000 - .word 0xffffffff - .word 0xfffffffe - .word 0xfffffffe - .word 0xfffffffe - .word 0xffffffff - .word 0x00000000 - -/* P-256 domain parameter n (order of base point) */ -.globl p256_n -.balign 32 -p256_n: - .word 0xfc632551 - .word 0xf3b9cac2 - .word 0xa7179e84 - .word 0xbce6faad - .word 0xffffffff - .word 0xffffffff - .word 0x00000000 - .word 0xffffffff - -/* Barrett constant u for n */ -.globl p256_u_n -.balign 32 -p256_u_n: - .word 0xeedf9bfe - .word 0x012ffd85 - .word 0xdf1a6c21 - .word 0x43190552 - .word 0xffffffff - .word 0xfffffffe - .word 0xffffffff - .word 0x00000000 - -/* P-256 basepoint G affine x-coordinate */ -.globl p256_gx -.balign 32 -p256_gx: - .word 0xd898c296 - .word 
0xf4a13945 - .word 0x2deb33a0 - .word 0x77037d81 - .word 0x63a440f2 - .word 0xf8bce6e5 - .word 0xe12c4247 - .word 0x6b17d1f2 - -/* P-256 basepoint G affine y-coordinate */ -.globl p256_gy -.balign 32 -p256_gy: - .word 0x37bf51f5 - .word 0xcbb64068 - .word 0x6b315ece - .word 0x2bce3357 - .word 0x7c0f9e16 - .word 0x8ee7eb4a - .word 0xfe1a7f9b - .word 0x4fe342e2
diff --git a/sw/device/silicon_creator/lib/crypto/ecdsa_p256/p256_ecdsa.s b/sw/device/silicon_creator/lib/crypto/ecdsa_p256/p256_ecdsa.s deleted file mode 100644 index 7c43d22..0000000 --- a/sw/device/silicon_creator/lib/crypto/ecdsa_p256/p256_ecdsa.s +++ /dev/null
@@ -1,123 +0,0 @@ -/* Copyright lowRISC contributors. */ -/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ -/* SPDX-License-Identifier: Apache-2.0 */ - -/** - * Elliptic curve P-256 ECDSA - * - * Uses OTBN ECC P-256 lib to perform an ECDSA operations. - */ - -.section .text.start -.globl start -start: - /* Read mode, then tail-call either p256_ecdsa_sign or p256_ecdsa_verify */ - la x2, mode - lw x2, 0(x2) - - li x3, 1 - beq x2, x3, p256_ecdsa_sign - - li x3, 2 - beq x2, x3, p256_ecdsa_verify - - /* Mode is neither 1 (= sign) nor 2 (= verify). Fail. */ - unimp - -.text -p256_ecdsa_sign: - jal x1, p256_ecdsa_setup_rand - jal x1, p256_sign - ecall - -p256_ecdsa_verify: - jal x1, p256_verify - ecall - -/** - * Populate the variables rnd and k with randomness, and setup data pointers. - */ -p256_ecdsa_setup_rand: - /* Obtain the blinding constant from URND, and write it to `rnd` in DMEM. */ - bn.wsrr w0, 0x2 /* URND */ - la x10, rnd - bn.sid x0, 0(x10) - - /* Point dptr_rnd to rnd. */ - la x11, dptr_rnd - sw x10, 0(x11) - - /* Obtain the nonce (k) from RND. */ - bn.wsrr w0, 0x1 /* RND */ - la x10, k - bn.sid x0, 0(x10) - - /* Point dptr_k to k. */ - la x11, dptr_k - sw x10, 0(x11) - - ret - -.data - -/* Freely available DMEM space. */ - -/* Operation mode (1 = sign; 2 = verify) */ -.globl mode -.balign 4 -mode: - .zero 4 - -/* All constants below must be 256b-aligned. */ - -/* random scalar k */ -.balign 32 -k: - .zero 32 - -/* randomness for blinding */ -.balign 32 -rnd: - .zero 32 - -/* message digest */ -.globl msg -.balign 32 -msg: - .zero 32 - -/* signature R */ -.globl r -.balign 32 -r: - .zero 32 - -/* signature S */ -.globl s -.balign 32 -s: - .zero 32 - -/* public key x-coordinate */ -.globl x -.balign 32 -x: - .zero 32 - -/* public key y-coordinate */ -.globl y -.balign 32 -y: - .zero 32 - -/* private key d */ -.globl d -.balign 32 -d: - .zero 32 - -/* verification result x_r (aka x_1) */ -.globl x_r -.balign 32 -x_r: - .zero 32