blob: 21aaeff1fec9d377c3f31ad18a3bf91d7e0a06db [file] [log] [blame]
/* Copyright lowRISC contributors. */
/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
/* SPDX-License-Identifier: Apache-2.0 */
/*
* P-384 specific routines for ECDSA signature generation and constant-time
* scalar multiplication.
*/
.section .text
/**
* Convert projective coordinates of a P-384 curve point to affine coordinates
*
* returns P = (x_a, y_a) = (x/z mod p, y/z mod p)
* where P is a valid P-384 curve point,
* x_a and y_a are the resulting affine coordinates of the
* curve point,
* x,y and z are a set of projective coordinates of the
* point and
* p is the modulus of the P-384 underlying finite field.
*
* This routine computes the affine coordinates for a set of projective
* coordinates of a valid P-384 curve point. The routine performs the required
* divisions by computing the multiplicative modular inverse of the
* projective z-coordinate in the underlying finite field of the P-384 curve.
* For inverse computation Fermat's little theorem is used, i.e.
* we compute z^-1 = z^(p-2) mod p.
* For exponentiation a 16 step addition chain is used.
* Source of the addition chain is the addchain project:
* https://github.com/mmcloughlin/addchain/
*
* Flags: Flags have no meaning beyond the scope of this subroutine.
*
* @param[in] [w26,w25]: x, x-coordinate of curve point (projective).
* @param[in] [w26,w25]: y, y-coordinate of curve point (projective).
* @param[in] [w30,w29]: z, z-coordinate of curve point (projective).
* @param[in] [w13, w12]: p, modulus of P-384.
* @param[in] w31: all-zero.
* @param[out] [w26, w25]: x_a, affine x-coordinate of resulting point.
* @param[out] [w28, w27]: y_a, affine y-coordinate of resulting point.
*
* clobbered registers: w0 to w28
* clobbered flag groups: FG0
*/
proj_to_affine_p384:
/* Exp: 0b10 = 2*0b1
Val: r10 = z^2 mod p
[w17,w16] <= [w30,w29]^2 mod [w13,w12] */
bn.mov w10, w29
bn.mov w11, w30
bn.mov w16, w29
bn.mov w17, w30
jal x1, p384_mulmod_p
/* Exp: 0b11 = 0b1+0b10
Val: r11 <= z*r10 mod p
[w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */
bn.mov w10, w29
bn.mov w11, w30
jal x1, p384_mulmod_p
/* Exp: 0b110 = 2*0b11
Val: r110 = r11^2 mod p
[w17,w16] <= [w17,w16]^2 mod [w13,w12] */
bn.mov w10, w16
bn.mov w11, w17
jal x1, p384_mulmod_p
/* Exp: 0b111 = 0b1+0b110
Val: r111 <= z*r110 mod p
[w1,w0] = [w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */
bn.mov w10, w29
bn.mov w11, w30
jal x1, p384_mulmod_p
bn.mov w0, w16
bn.mov w1, w17
/* Exp: 0b111000 = 0b111<<3
Val: r111000 <= r111^(2^3) mod p
[w17,w16] <= [w17,w16]^(2^3) mod [w13,w12] */
loopi 3, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, p384_mulmod_p
nop
/* Exp: 0b1111111 = 0b111+0b111000
Val: r1111111 <= r111*r111000 mod p
[w3,w2] = [w17,w16] <= [w1,w0]*[w17,w16] mod [w13,w12] */
bn.mov w10, w0
bn.mov w11, w1
jal x1, p384_mulmod_p
bn.mov w2, w16
bn.mov w3, w17
/* Exp: 2^12-1 = (0b1111111<<6)+0b111111
Val: r_12_1 <= r111111^(2^6)*r111111 mod p
[w5,w4] = [w17,w16] <= [w17,w16]^(2^6)*[w17,w16] mod [w13,w12] */
loopi 6, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, p384_mulmod_p
nop
bn.mov w10, w2
bn.mov w11, w3
jal x1, p384_mulmod_p
bn.mov w4, w16
bn.mov w5, w17
/* Exp: 2^24-1 = ((2^12-1)<<12)+(2^12-1)
Val: r_24_1 <= r_12_1^(2^12)*r12_1 mod p
[w17,w16] <= [w17,w16]^(2^12)*[w5,w4] mod [w13,w12] */
loopi 12, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, p384_mulmod_p
nop
bn.mov w10, w4
bn.mov w11, w5
jal x1, p384_mulmod_p
/* Exp: 2^30-1 = ((2^24-1)<<6)+0b111111
Val: r_30_1 <= r_24_1^(2^6)*r111111 mod p
[w3, w2] = [w17,w16] <= [w17,w16]^(2^6)*[w3,w2] mod [w13,w12] */
loopi 6, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, p384_mulmod_p
nop
bn.mov w10, w2
bn.mov w11, w3
jal x1, p384_mulmod_p
bn.mov w2, w16
bn.mov w3, w17
/* Exp: 2^31-1 <= (2^30-1)*2+0b1
Val: r_31_1 <= r30_1^2*z mod p
[w7,w6] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12] */
bn.mov w10, w16
bn.mov w11, w17
jal x1, p384_mulmod_p
bn.mov w10, w29
bn.mov w11, w30
jal x1, p384_mulmod_p
bn.mov w6, w16
bn.mov w7, w17
/* Exp: 2^32-1 <= (2^30-1)*2+0b1
Val: r_32_1 <= r31_1^2*z mod p
[w9,w8] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12] */
bn.mov w10, w16
bn.mov w11, w17
jal x1, p384_mulmod_p
bn.mov w10, w29
bn.mov w11, w30
jal x1, p384_mulmod_p
bn.mov w9, w16
bn.mov w8, w17
/* Exp: 2^63-1 <= ((2^32-1)<<31)+(2^31-1)
Val: r_63_1 <= r_32_1^(2^31)*r_31_1 mod p
[w7,w6] = [w17,w16] <= [w17,w16]^(2^31)*[w7,w6] mod [w13,w12] */
loopi 31, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, p384_mulmod_p
nop
bn.mov w10, w6
bn.mov w11, w7
jal x1, p384_mulmod_p
bn.mov w6, w16
bn.mov w7,w17
/* Exp: 2^126-1 = ((2^63-1)<<63) + (2^63-1)
Val: r_126_1 <= r_63_1^(2^63)*r_63_1 mod p
[w7,w6] = [w17,w16] <= [w17,w16]^(2^63)*[w7,w6] mod [w13,w12] */
loopi 63, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, p384_mulmod_p
nop
bn.mov w10, w6
bn.mov w11, w7
jal x1, p384_mulmod_p
bn.mov w6, w16
bn.mov w7, w17
/* Exp: 2^252-1 = ((2^126-1)<<126)+(2^126-1)
Val: r_252_1 <= r_126_1^(2^63)*r_126_1 mod p
[w17,w16] <= [w17,w16]^(2^126)*[w7,w6] mod [w13,w12] */
loopi 126, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, p384_mulmod_p
nop
bn.mov w10, w6
bn.mov w11, w7
jal x1, p384_mulmod_p
/* Exp: 2^255-1 = ((2^252-1)<<3)+0b111
Val: r_255_1 <= r_252_1^(2^3)*r111 mod p
[w17,w16] <= [w17,w16]^(2^3)*[w1,w0] mod [w13,w12] */
loopi 3, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, p384_mulmod_p
nop
bn.mov w10, w0
bn.mov w11, w1
jal x1, p384_mulmod_p
/* Exp: p-2 = ((((((2^255-1)<<33)+(2^32-1))<<94)+(2^30-1))<<2)+0b1
Val: x_inv <=((r_255_1^(2^33)*r_32_1)^(2^94)*r_30_1)^(2^2)*z mod p
[w17,w16] <= (([w17,w16]^(2^33)*[w9,w8])^(2^94)*[w3,w2])^(2^2)
*[w30,w29] mod [w13,w12] */
loopi 33, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, p384_mulmod_p
nop
bn.mov w10, w9
bn.mov w11, w8
jal x1, p384_mulmod_p
loopi 94, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, p384_mulmod_p
nop
bn.mov w10, w2
bn.mov w11, w3
jal x1, p384_mulmod_p
loopi 2, 4
bn.mov w10, w16
bn.mov w11, w17
jal x1, p384_mulmod_p
nop
bn.mov w10, w29
bn.mov w11, w30
jal x1, p384_mulmod_p
/* store inverse [w1,w0] <= [w17,w16] = z_inv*/
bn.mov w0, w16
bn.mov w1, w17
/* convert x-coordinate to affine space
[w26,w25] <= [w17,w16] = x_a <= x/z = x*z_inv = [w26,w25]*[w1,w0] mod p */
bn.mov w10, w25
bn.mov w11, w26
jal x1, p384_mulmod_p
bn.mov w25, w16
bn.mov w26, w17
/* convert y-coordinate to affine space
[w28,w27] <= [w17,w16] = y_a <= y/z = y*z_inv = [w28,w27]*[w1,w0] mod p */
bn.mov w10, w27
bn.mov w11, w28
bn.mov w16, w0
bn.mov w17, w1
jal x1, p384_mulmod_p
bn.mov w27, w16
bn.mov w28, w17
ret
/**
* Fetch curve point from dmem, randomize z-coordinate and store point in dmem
*
* returns P = (x, y, z) = (x_a*z, y_a*z, z)
* with P being a valid P-384 curve point in projective coordinates
* x_a and y_a being the affine coordinates as fetched from dmem
* z being a randomized z-coordinate
*
* This routines fetches the affine x- and y-coordinates of a curve point from
* dmem and computes a valid set of projective coordinates. The z-coordinate is
* randomized and x and y are scaled appropriately. The resulting projective
* coordinates are stored at dmem[dptr_p_p] using 6 consecutive 256-bit cells,
* i.e. each coordinate is stored 512 bit aligned, little endian.
* This routine runs in constant time.
*
* @param[in] x20: dptr_x, pointer to dmem location containing affine
* x-coordinate of input point
* @param[in] x21: dptr_y, pointer to dmem location containing affine
* y-coordinate of input point
* @param[in] [w15, w14]: u[383:0] lower 384 bit of Barrett constant u for
* modulus p
* @param[in] [w13, w12]: p, modulus of P-384 underlying finite field
* @param[in] w31: all-zero
* @param[in] x18: dptr_p_p, pointer to dmem location to store resulting point
* in projective space
*
* Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
* the upper limb of projective y-coordinate.
*
* clobbered registers: x10, x11 to x13
* w2, w3, w8 to w11, w16 to w24, w29, w30
* clobbered flag groups: FG0
*/
store_proj_randomize:
/* get a 384-bit random number from URND
[w3, w2] = random(384) */
bn.wsrr w2, 2
bn.wsrr w3, 2
bn.rshi w3, w31, w3 >> 128
/* reduce random number
[w2, w3] = z <= [w2, w3] mod p */
bn.sub w10, w2, w12
bn.subb w11, w3, w13
bn.sel w2, w2, w10, C
bn.sel w3, w3, w11, C
bn.mov w10, w2
bn.mov w11, w3
/* store z-coordinate
dmem[x20+128] = [w10, w11] */
li x10, 10
li x11, 11
bn.sid x10, 128(x18)
bn.sid x11, 160(x18)
/* fetch x-coordinate from dmem
[w16, w17] = x <= [dmem[dptr_x], dmem[dptr_x+32]] */
li x12, 16
li x13, 17
bn.lid x12, 0(x20)
bn.lid x13, 32(x20)
/* scale and store x-coordinate
[dmem[dptr_p_p], dmem[dptr_p_p+32]] = [w17, w16] =
x_p <= [w11, w10] * [w17, w16] = z*x mod p */
jal x1, p384_mulmod_p
bn.sid x12, 0(x18)
bn.sid x13, 32(x18)
/* fetch y-coordinate from dmem
[w11, w10] = x <= [dmem[dptr_y], dmem[dptr_y+32]] */
bn.lid x12, 0(x21)
bn.lid x13, 32(x21)
/* scale and store y-coordinate
[dmem[dptr_p_p+64], dmem[dptr_p_p+96]] = [w17, w16] =
y_p <= [w11, w10] * [w17, w16] = z*y mod p */
bn.mov w10, w2
bn.mov w11, w3
jal x1, p384_mulmod_p
bn.sid x12, 64(x18)
bn.sid x13, 96(x18)
ret
/**
* P-384 scalar point multiplication in affine space
*
* returns R = k*P = k*(x_p, y_p)
* where R, P are valid P-384 curve points in affine coordinates,
* k is a 384-bit scalar.
*
* This routine performs scalar multiplication based on the group laws
* of Weierstrass curves.
* A constant time double-and-add algorithm (sometimes referred to as
* double-and-add-always) is used.
* Due to the P-384 optimized implementations of the internally called routines
* for point addition and doubling, this routine is limited to P-384 curves.
* The routine makes use of blinding by additive splitting the
* exponent/scalar d into two shares. The double-and-add loop operates on both
* shares in parallel applying Shamir's trick.
*
* @param[in] x9: dptr_rnd, pointer to location in dmem containing random
* number to be used for additive splitting of scalar
* @param[in] x19: dptr_k, pointer to scalar k (0 < k < n) in dmem
* @param[in] x20: dptr_x, pointer to affine x-coordinate in dmem
* @param[in] x21: dptr_y, pointer to affine y-coordinate in dmem
* @param[in] x28: dptr_b, pointer to domain parameter b of P-384 in dmem
* @param[in] x30: dptr_sp, pointer to 704 bytes of scratchpad memory in dmem
* @param[in] [w13, w12]: p, modulus of P-384 underlying finite field
* @param[in] [w11, w10]: n, domain parameter of P-384 curve
* (order of base point G)
* @param[in] w31: all-zero
* @param[out] [w26, w25]: x_a, affine x-coordinate of resulting point R.
* @param[out] [w28, w26]: y_a, affine y-coordinate of resulting point R.
*
* Scratchpad memory layout:
* The routine expects at least 704 bytes of scratchpad memory at dmem
* location 'scratchpad' (sp). Internally the scratchpad is used as follows:
* dptr_sp .. dptr_sp+191: point P, projective
* dptr_sp+192 .. dptr_sp+255: s0, 1st share of scalar
* dptr_sp+256 .. dptr_sp+447: point 2P, projective
* dptr_sp+448 .. dptr_sp+511: s1, 2nd share of scalar
* dptr_sp+512 .. dptr_sp+703: point Q, projective
*
* Projective coordinates of a point are kept in dmem in little endian format
* with the individual coordinates 512 bit aligned. The coordinates are stored
* in x,y,z order (i.e. x at lowest, z at highest address). Thus, a 384 bit
* curve point occupies 6 consecutive 256-bit dmem cells.
*
* Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
* the computed affine y-coordinate.
*
* clobbered registers: x2, x10, x11 to x13, x18, x26, x27, w0 to w30
* clobbered flag groups: FG0
*/
scalar_mult_int_p384:
/* set regfile pointers to in/out regs of Barrett routine. Set here to avoid
resetting in very call to point addition routine */
li x22, 10
li x23, 11
li x24, 16
li x25, 17
/* fetch externally supplied random number from dmem
[w1, w0] = dmem[dptr_rnd] = [dmem[x9], dmem[x9+32]] = rnd */
li x2, 0
bn.lid x2++, 0(x9)
bn.lid x2++, 32(x9)
/* 1st share (reduced rnd)
s0 = [w1, w0] <= rnd mod n = [w1, w0] mod [w11, w10] */
bn.sub w9, w0, w10
bn.subb w8, w1, w11
bn.sel w0, w0, w9, C
bn.sel w1, w1, w8, C
/* load scalar k from dmem
[w3, w2] = k <= dmem[dptr_k] = [dmem[x19], dmem[x19+32]] */
bn.lid x2++, 0(x19)
bn.lid x2, 32(x19)
/* 2nd share (k-s0)
s1 = [w3, w2] <= k - s0 mod n = [w2, w3] - [w1, w0] mod [w11, w10] */
bn.sub w2, w2, w0
bn.subb w3, w3, w1
bn.add w8, w2, w10
bn.addc w9, w3, w11
bn.sel w2, w8, w2, C
bn.sel w3, w9, w3, C
/* left align both shares for probing of MSB in loop body */
bn.rshi w1, w1, w0 >> 128
bn.rshi w0, w0, w31 >> 128
bn.rshi w3, w3, w2 >> 128
bn.rshi w2, w2, w31 >> 128
/* store shares in scratchpad */
li x2, 0
bn.sid x2++, 192(x30)
bn.sid x2++, 224(x30)
bn.sid x2++, 448(x30)
bn.sid x2++, 480(x30)
/* get randomized projective coodinates of curve point
P = (x_p, y_p, z_p) = dmem[dptr_sp] = (x*z mod p, y*z mod p, z) */
add x18, x30, 0
jal x1, store_proj_randomize
/* double point P
2P = ([w30,w29], [w28,w27], [w26, w25]) <= 2*P */
add x27, x30, x0
add x26, x30, x0
jal x1, proj_add_p384
/* store point 2P in scratchpad @w30+256
dmem[dptr_sc+256] = [w30:w25] = 2P */
li x2, 25
bn.sid x2++, 256(x30)
bn.sid x2++, 288(x30)
bn.sid x2++, 320(x30)
bn.sid x2++, 352(x30)
bn.sid x2++, 384(x30)
bn.sid x2++, 416(x30)
/* init point Q = (0,1,0) for double-and-add in scratchpad */
/* dmem[x26] = dmem[dptr_sc+512] = Q = (0,1,0) */
addi x26, x30, 512
li x2, 30
bn.addi w30, w31, 1
bn.sid x2++, 64(x26)
bn.sid x2, 0(x26)
bn.sid x2, 32(x26)
bn.sid x2, 96(x26)
bn.sid x2, 128(x26)
bn.sid x2, 160(x26)
/* double-and-add loop with decreasing index */
loopi 384, 85
/* double point Q
Q = ([w30,w29], [w28,w27], [w26, w25]) <= Q + dmem[x27] */
add x27, x26, x0
jal x1, proj_add_p384
/* store Q in dmem
dmem[x26] = dmem[dptr_sc+512] <= [w30:w25] */
li x2, 25
bn.sid x2++, 0(x26)
bn.sid x2++, 32(x26)
bn.sid x2++, 64(x26)
bn.sid x2++, 96(x26)
bn.sid x2++, 128(x26)
bn.sid x2++, 160(x26)
/* Probe if MSb of either of the two scalars (rnd or d-rnd) but not both
is 1.
If only one MSb is set, select P for addition.
If both MSbs are set, select 2P for addition.
(If neither MSB is set, 2P will be selected but result discarded.) */
li x2, 0
bn.lid x2++, 224(x30)
bn.lid x2, 480(x30)
bn.xor w8, w0, w1
/* Create conditional offeset into scratchpad.
if (s0[512] xor s1[512]) x27 <= x30 else x27 <= x30+256 */
csrrs x3, 0x7c0, x0
xori x3, x3, -1
andi x3, x3, 2
slli x27, x3, 7
add x27, x27, x30
/* Reload randomized projective coodinates for curve point P.
P = (x_p, y_p, z_p) = dmem[dptr_sp] <= (x*z mod p, y*z mod p, z) */
jal x1, store_proj_randomize
/* Add points Q+P or Q+2P depending on offset in x27.
Q_a = ([w30,w29], [w28,w27], [w26, w25]) <= Q + dmem[x27] */
jal x1, proj_add_p384
/* load shares from scratchpad
[w1, w0] = s0; [w3, w2] = s1 */
li x2, 0
bn.lid x2++, 192(x30)
bn.lid x2++, 224(x30)
bn.lid x2++, 448(x30)
bn.lid x2++, 480(x30)
/* M = s0[511] | s1[511] */
bn.or w8, w1, w3
/* load q from scratchpad
Q = ([w9,w8], [w7,w6], [w5,w4]) <= dmem[x26] */
li x2, 4
bn.lid x2++, 0(x26)
bn.lid x2++, 32(x26)
bn.lid x2++, 64(x26)
bn.lid x2++, 96(x26)
bn.lid x2++, 128(x26)
bn.lid x2++, 160(x26)
/* select either Q or Q_a
if M: Q = ([w30,w29], [w28,w27], [w26, w25]) <= Q else: Q <= Q_a */
bn.sel w25, w25, w4, M
bn.sel w26, w26, w5, M
bn.sel w27, w27, w6, M
bn.sel w28, w28, w7, M
bn.sel w29, w29, w8, M
bn.sel w30, w30, w9, M
/* store Q in dmem
dmem[x26] = dmem[dptr_sc+512] <= [w30:w25] */
li x2, 25
bn.sid x2++, 0(x26)
bn.sid x2++, 32(x26)
bn.sid x2++, 64(x26)
bn.sid x2++, 96(x26)
bn.sid x2++, 128(x26)
bn.sid x2++, 160(x26)
/* left shift both shares
s0 <= s0 << 1 ; s1 <= s1 << 1 */
bn.add w0, w0, w0
bn.addc w1, w1, w1
bn.add w2, w2, w2
bn.addc w3, w3, w3
/* store both shares in scratchpad */
li x2, 0
bn.sid x2++, 192(x30)
bn.sid x2++, 224(x30)
bn.sid x2++, 448(x30)
bn.sid x2++, 480(x30)
/* Get a fresh random number from URND and scale the coordinates of 2P.
(scaling each proj. coordinate by same factor results in same point) */
/* get a 384-bit random number from URND */
bn.wsrr w2, 2
bn.wsrr w3, 2
bn.rshi w3, w31, w3 >> 128
/* reduce random number
[w2, w3] = z <= [w2, w3] mod p */
bn.sub w10, w2, w12
bn.subb w11, w3, w13
bn.sel w2, w2, w10, C
bn.sel w3, w3, w11, C
/* scale all coordinates in scratchpad */
li x2, 16
li x3, 17
/* x-coordinate */
bn.mov w10, w2
bn.mov w11, w3
bn.lid x2, 256(x30)
bn.lid x3, 288(x30)
jal x1, p384_mulmod_p
bn.sid x2, 256(x30)
bn.sid x3, 288(x30)
/* y-coordinate */
bn.mov w10, w2
bn.mov w11, w3
bn.lid x2, 320(x30)
bn.lid x3, 352(x30)
jal x1, p384_mulmod_p
bn.sid x2, 320(x30)
bn.sid x3, 352(x30)
/* z-coordinate */
bn.mov w10, w2
bn.mov w11, w3
bn.lid x2, 384(x30)
bn.lid x3, 416(x30)
jal x1, p384_mulmod_p
bn.sid x2, 384(x30)
bn.sid x3, 416(x30)
/* convert coordinates to affine space */
jal x1, proj_to_affine_p384
ret
/**
* Externally callable wrapper for P-384 scalar point multiplication
*
* returns R = k*P = k*(x_p, y_p)
* where R, P are valid P-384 curve points in affine coordinates,
* k is a 384-bit scalar..
*
* Sets up context and calls the internal scalar multiplication routine.
* This routine runs in constant time.
*
* @param[in] dmem[0]: dK, pointer to location in dmem containing scalar k
* @param[in] dmem[4]: dRnd, pointer to location in dmem containing random
* number for blinding
* @param[in] dmem[20]: dptr_x, pointer to affine x-coordinate in dmem
* @param[in] dmem[22]: dptr_y, pointer to affine y-coordinate in dmem
*
* 384-bit quantities have to be provided in dmem in little-endian format,
* 512 bit aligned, with the highest 128 bit set to zero.
*
* Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
* the computed affine y-coordinate.
*
* clobbered registers: x2, x3, x9 to x13, x18 to x21, x26 to x30
* w0 to w30
* clobbered flag groups: FG0
*/
.globl scalar_mult_p384
scalar_mult_p384:
/* set dmem pointer to point x-coordinate */
la x20, dptr_x
lw x20, 0(x20)
/* set dmem pointer to point y-coordinate */
la x21, dptr_y
lw x21, 0(x21)
/* set dmem pointer to scalar k */
la x19, dptr_k
lw x19, 0(x19)
/* set pointer to blinding parameter */
la x9, dptr_rnd
lw x9, 0(x9)
/* set dmem pointer to domain parameter b */
la x28, p384_b
/* set dmem pointer to scratchpad */
la x30, scratchpad
/* load domain parameter p (modulus)
[w13, w12] = p = dmem[p384_p] */
li x2, 12
la x3, p384_p
bn.lid x2++, 0(x3)
bn.lid x2++, 32(x3)
/* load domain parameter n (order of base point)
[w11, w10] = n = dmem[p384_n] */
li x2, 10
la x3, p384_n
bn.lid x2++, 0(x3)
bn.lid x2++, 32(x3)
/* init all-zero reg */
bn.xor w31, w31, w31
jal x1, scalar_mult_int_p384
/* store result in dmem */
li x2, 25
bn.sid x2++, 0(x20)
bn.sid x2++, 32(x20)
bn.sid x2++, 0(x21)
bn.sid x2++, 32(x21)
ret
/**
* Externally callable routine for P-384 base point multiplication
*
* returns Q = d (*) G
* where Q is a resulting valid P-384 curve point in affine
* coordinates,
* G is the base point of curve P-384, and
* d is a 384-bit scalar.
*
* Sets up context and calls the internal scalar multiplication routine.
* This routine runs in constant time.
*
* @param[in] dmem[0]: dptr_d, pointer to location in dmem containing
* scalar d.
* @param[in] dmem[20]: dptr_x, pointer to result buffer for x-coordinate
* @param[in] dmem[24]: dptr_y, pointer to result buffer for y-coordinate
* @param[in] dmem[28]: dptr_rnd, pointer to location in dmem containing
* random number for blinding.
*
* 384-bit quantities have to be provided in dmem in little-endian format,
* 512 bit aligned, with the highest 128 bit set to zero.
*
* Flags: When leaving this subroutine, the M, L and Z flags of FG0 correspond
* to the computed affine y-coordinate.
*
* clobbered registers: x2, x3, x9 to x13, x18 to x21, x26 to x30
* w0 to w30
* clobbered flag groups: FG0
*/
.globl p384_base_mult
p384_base_mult:
/* set dmem pointer to x-coordinate of base point*/
la x20, p384_gx
/* set dmem pointer to y-coordinate of base point */
la x21, p384_gy
/* set dmem pointer to scalar d */
la x19, dptr_d
lw x19, 0(x19)
/* set pointer to blinding parameter */
la x9, dptr_rnd
lw x9, 0(x9)
/* set dmem pointer to domain parameter b */
la x28, p384_b
/* set dmem pointer to scratchpad */
la x30, scratchpad
/* load domain parameter p (modulus)
[w13, w12] = p = dmem[p384_p] */
li x2, 12
la x3, p384_p
bn.lid x2++, 0(x3)
bn.lid x2++, 32(x3)
/* load domain parameter n (order of base point)
[w11, w10] = n = dmem[p384_n] */
li x2, 10
la x3, p384_n
bn.lid x2++, 0(x3)
bn.lid x2++, 32(x3)
/* init all-zero reg */
bn.xor w31, w31, w31
jal x1, scalar_mult_int_p384
/* set dmem pointer to point x-coordinate */
la x20, dptr_x
lw x20, 0(x20)
/* set dmem pointer to point y-coordinate */
la x21, dptr_y
lw x21, 0(x21)
/* store result in dmem */
li x2, 25
bn.sid x2++, 0(x20)
bn.sid x2++, 32(x20)
bn.sid x2++, 0(x21)
bn.sid x2++, 32(x21)
ret
/**
* Variable-time modular multiplicative inverse computation
*
* returns x_inv = x^-1 mod m
*
* This routine computes the modular multiplicative inverse for any x < m in
* the finite field GF(m) where m is prime.
*
* For inverse computation, Fermat's little theorem is used, i.e.
* we compute x^-1 = x^(m-2) mod m.
* For exponentiation we use a standard, variable-time (!) square and multiply
* algorithm.
*
* This routine is mainly intended to be used for inversion of scalars in
* context of the P-384 curve. In theory, it can be used with any 384-bit
* modulus m with a corresponding 385-bit Barrett constant u,
* where u[383:192] = 0.
*
* Note: When used for P-384 scalar inversion, the routine will need 672 calls
* to the multiplication routine. By using an adder chain this could be reduced
* to ~433 multiplications, however, at the cost of a significant codes size
* increase.
*
* Note: This routine runs in variable-time w.r.t. the modulus. It should only
* be used with a non-secret modulus.
*
* @param[in] [w13, w12]: m, 384 bit modulus
* @param[in] w14: k, Solinas constant (2^384 - m) (max. length 191 bits).
* @param[in] [w30, w29]: x, 384 bit operand
* @param[in] w31, all-zero
* @param[out] [w17, w16]: x_inv, modular multiplicative inverse
*
* Flags: Flags have no meaning beyond the scope of this subroutine.
*
* clobbered registers: x2, w2, w3, w10, w11, w16 to w24
* clobbered flag groups: FG0
*/
mod_inv_n_p384:
/* subtract 2 from modulus for Fermat's little theorem
[w13,w12] <= m - 2 = [w11,w10]-2 (left aligned) */
bn.subi w2, w12, 2
bn.subb w3, w13, w31
bn.rshi w3, w3, w2 >> 128
bn.rshi w2, w2, w31 >> 128
/* init square and multiply: [w17,w16] = 1 */
bn.addi w16, w31, 1
bn.mov w17, w31
/* square and multiply loop */
loopi 384, 12
/* square: [w17,w16] <= [w17, w16]*[w11,w10] mod [w13, w12] */
bn.mov w10, w16
bn.mov w11, w17
jal x1, p384_mulmod_n
/* shift MSB into carry flag
[w3,w2] = 2*[w3,w2] = [w3,w2] << 1 */
bn.add w2, w2, w2
bn.addc w3, w3, w3
/* skip multiplication if C flag not set */
csrrs x2, 0x7c0, x0
andi x2, x2, 1
beq x2, x0, nomul
/* multiply: [w17,w16] <= [w17, w16]*[w30,w29] mod [w13, w12] */
bn.mov w10, w29
bn.mov w11, w30
jal x1, p384_mulmod_n
nomul:
nop
ret
/**
* P-384 ECDSA signature generation
*
* returns the signature as the pair r, s with
* r = x_1 mod n
* and s = k^(-1)(msg + r*d) mod n
* where x_1 is the affine x-coordinate of the curve point k*G,
* G is the curve's base point,
* k is a supplied secret random number,
* n is the order of the base point G of P-256,
* msg is the message to be signed, and
* d is the private key.
*
* This routine runs in constant time.
*
* @param[in] dmem[0]: dptr_k, pointer to a 384 bit random secret in dmem
* @param[in] dmem[4]: dptr_rnd, pointer to location in dmem containing
* a 384-bit random number for blinding
* @param[in] dmem[8]: dptr_msg, pointer to the message to be signed in dmem
* @param[in] dmem[12]: dptr_r, pointer to dmem location where s component
* of signature will be placed
* @param[in] dmem[16]: dptr_s, pointer to dmem location where r component
* of signature will be placed
* @param[in] dmem[28]: dptr_d, pointer to private key d in dmem
*
* Flags: Flags have no meaning beyond the scope of this subroutine.
*
* clobbered registers: x2, x3, x9 to x13, x18 to x28, x30
* w0 to w31
* clobbered flag groups: FG0
*/
.globl p384_sign
p384_sign:
/* init all-zero reg */
bn.xor w31, w31, w31
/* set dmem pointer to domain parameter b */
la x28, p384_b
/* set dmem pointer to base point x-coordinate */
la x20, p384_gx
/* set dmem pointer to base point y-coordinate */
la x21, p384_gy
/* set dmem pointer to secret random scalar k */
la x19, dptr_k
lw x19, 0(x19)
/* set pointer to blinding parameter */
la x9, dptr_rnd
lw x9, 0(x9)
/* set dmem pointer to scratchpad */
la x30, scratchpad
/* load domain parameter p (modulus)
[w13, w12] <= p = dmem[dptr_p] */
li x2, 12
la x3, p384_p
bn.lid x2++, 0(x3)
bn.lid x2++, 32(x3)
/* load domain parameter n (order of base point)
[w11, w10] = n = dmem[p384_n] */
li x2, 10
la x3, p384_n
bn.lid x2++, 0(x3)
bn.lid x2++, 32(x3)
/* scalar multiplication with base point
[w28:w25] <= (x_1, y_1) = k*G */
jal x1, scalar_mult_int_p384
/* store r of signature in dmem: dmem[dptr_r] <= r = [w26,w25] */
li x2, 25
la x3, dptr_r
lw x3, 0(x3)
bn.sid x2++, 0(x3)
bn.sid x2++, 32(x3)
/* load secret random number k from dmem
[w30,w29] <= k = dmem[dptr_k] */
li x2, 29
bn.lid x2++, 0(x19)
bn.lid x2++, 32(x19)
/* load domain parameter n (order of base point)
[w13, w12] <= p = dmem[p384_n] */
li x2, 12
la x3, p384_n
bn.lid x2++, 0(x3)
bn.lid x2++, 32(x3)
/* Compute Solinas constant k for modulus n (we know it is only 191 bits, so
no need to compute the high part):
w14 <= 2^256 - n[255:0] = (2^384 - n) mod (2^256) = 2^384 - n */
bn.sub w14, w31, w12
/* modular multiplicative inverse of k
[w3, w2] <= [w17, w16] <= k^(-1) mod n */
jal x1, mod_inv_n_p384
bn.mov w2, w16
bn.mov w3, w17
/* load private key d from dmem
[w11,w10] <= d = dmem[dptr_d] */
li x2, 10
la x3, dptr_d
lw x3, 0(x3)
bn.lid x2++, 0(x3)
bn.lid x2++, 32(x3)
/* [w17, w16] <= k^(-1)*d mod n = [w17, w16] * [w11, w10] mod [w13, w12] */
jal x1, p384_mulmod_n
/* [w5, w4] <= [w17, w16]
<= r * (k^(-1)*d) mod n = [w26, w25] * [w17, w16] mod [w13, w12] */
bn.mov w10, w25
bn.mov w11, w26
jal x1, p384_mulmod_n
bn.mov w4, w16
bn.mov w5, w17
/* load message from dmem
[w11, w10] <= msg = dmem[dptr_msg] */
li x2, 10
la x3, dptr_msg
lw x3, 0(x3)
bn.lid x2++, 0(x3)
bn.lid x2++, 32(x3)
/* [w17, w16] <= k^(-1) * msg = [w3, w2]*[w17, w16] mod n */
bn.mov w16, w2
bn.mov w17, w3
jal x1, p384_mulmod_n
/* [w28, w27] <= s' = k^(-1)*msg + k^(-1)*r*d = [w17, w16] + [w5, w4]*/
bn.add w27, w16, w4
bn.addc w28, w17, w5
/* reduce s: [w28, w27] <= s <= s' mod n = [w28, w27] mod [w13, w12] */
bn.sub w10, w27, w12
bn.subb w11, w28, w13
bn.sel w27, w27, w10, C
bn.sel w28, w28, w11, C
/* store s of signature in dmem: dmem[dptr_s] <= s = [w28, w27] */
li x2, 27
la x3, dptr_s
lw x3, 0(x3)
bn.sid x2++, 0(x3)
bn.sid x2++, 32(x3)
ret
/* pointers and scratchpad memory */
.section .data
/* pointer to k (dptr_k) */
.globl dptr_k
dptr_k:
.zero 4
/* pointer to rnd (dptr_rnd) */
.globl dptr_rnd
dptr_rnd:
.zero 4
/* pointer to msg (dptr_msg) */
.globl dptr_msg
dptr_msg:
.zero 4
/* pointer to R (dptr_r) */
.globl dptr_r
dptr_r:
.zero 4
/* pointer to S (dptr_s) */
.globl dptr_s
dptr_s:
.zero 4
/* pointer to X (dptr_x) */
.globl dptr_x
dptr_x:
.zero 4
/* pointer to Y (dptr_y) */
.globl dptr_y
dptr_y:
.zero 4
/* pointer to D (dptr_d) */
.globl dptr_d
dptr_d:
.zero 4
/* 704 bytes of scratchpad memory */
scratchpad:
.zero 704