blob: 7b50b41b27bb416605491ad28e5f2a0860690b27 [file] [log] [blame]
/* Copyright lowRISC Contributors.
* Copyright 2016 The Chromium OS Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE.dcrypto file.
*
* Derived from code in
* https://chromium.googlesource.com/chromiumos/platform/ec/+/refs/heads/cr50_stab/chip/g/dcrypto/dcrypto_bn.c
*/
.text
.globl modexp_65537
.globl modexp
.globl modload
/**
* Precomputation of a constant m0' for Montgomery modular arithmetic
*
* Word-wise Montgomery modular arithmetic requires a quantity m0' to be
* precomputed once per modulus M. m0' is the negative of the
* modular multiplicative inverse of the lowest limb m0 of the modulus M, in
* the field GF(2^w), where w is the number of bits per limb. w is set to 256
* in this subroutine.
*
* Returns: m0' = -m0^(-1) mod 2^256
* with m0 being the lowest limb of the modulus M
*
* This subroutine implements the Dusse-Kaliski method for computing the
* multiplicative modular inverse when the modulus is of the form 2^k.
* [Dus] DOI https://doi.org/10.1007/3-540-46877-3_21 section 3.2
* (Algorithm "Modular Inverse" on p. 235)
*
* Flags: When leaving this subroutine, flags of FG0 depend on a
* the final subtraction and can be used if needed.
* FG0.M, FG0.L, FG0.Z depend directly on the value of the result m0'.
* FG0.C is always set.
* FG1 is not modified in this subroutine.
*
* @param[in] w28: m0, the lowest 256 bit limb of the modulus M
* @param[in] w31: all-zero.
* @param[out] w29: m0', negative of inverse of m0 in GF(2^256)
*
* clobbered registers: w0, w1, w29
* clobbered flag groups: FG0
*/
m0inv:
/* w0 keeps track of loop iterations in one-hot encoding, i.e.
w0 = 2^i in the loop body below and initialized here with w0 = 1
It is used for both the comparison in step 4 of [Dus] and the
addition in step 6 of [Dus] */
bn.xor w0, w0, w0
bn.addi w0, w0, 1
/* according to [Dus] the result variable y is initialized with 1 */
/* w29 = y_0 = 1 */
bn.mov w29, w0
/* iterate over all 256 bits of m0.
i refers to the loop cycle 0..255 in the loop body below. */
loopi 256, 13
/* y_i <= m*y_{i-1] */
bn.mulqacc.z w28.0, w29.0, 0
bn.mulqacc w28.1, w29.0, 64
bn.mulqacc.so w1.L, w28.0, w29.1, 64
bn.mulqacc w28.2, w29.0, 0
bn.mulqacc w28.1, w29.1, 0
bn.mulqacc w28.0, w29.2, 0
bn.mulqacc w28.3, w29.0, 64
bn.mulqacc w28.2, w29.1, 64
bn.mulqacc w28.1, w29.2, 64
bn.mulqacc.so w1.U, w28.0, w29.3, 64
/* This checks if w1 = y_i = m0*y_(i-1) < 2^(i-1) mod 2^i
Due to the mathematical properties it can be shown that y_i at this point,
is either 1 or (10..0..01)_(i). Therefore, just probing the i_th bit is
the same as the full compare. */
bn.and w1, w1, w0
/* Compute
y_i=w29 <= w1=m0*y_(i-1) < 2^(i-1) mod 2^i y_i ? : y_{i-1}+2^i : y_{i-1}
there cannot be overlaps => or'ing is as good as adding */
bn.or w29, w29, w1
/* double w0 (w0 <= w0 << 1) i.e. w0=2^i */
bn.add w0, w0, w0
/* finally, compute m0' (negative of inverse)
w29 = m0' = -(m0^-1) mod 2^256 = -y_255 = 0 - y_255 = w31 - w29 */
bn.sub w29, w31, w29
ret
/**
* Constant time conditional subtraction of modulus from a bigint
*
* Returns C <= C-s*M
* with C being a bigint of length 256..4096 bit
* M being the modulus of length 256..4096 bit
* s being a boolean value [0,1]
*
* Conditionally subtracts the modulus located in dmem from the bigint
* located in a buffer in the wide regfile (starting at w5). The subtracted
* value is selected when FG1.C equals 1, otherwise the unmodified value is
* selected.
*
* Note that the interpretation of the subtrahend as a modulus is only
* contextual. In theory, it can be any bigint. However, the subtrahend is
* expected in dmem at a location that is reserved for the modulus according
* to the calling conventions within this library.
*
* Flags: When leaving this subroutine, flags of FG0 depend on a
* potentially discarded value and therefore are not usable after
* return.
* FG1 is not modified in this subroutine.
*
* @param[in] x16: dptr_m, pointer to 1st limb of modulus M
* @param[in] x30: N, number of 256 bit limbs in modulus and bigint
* @param[in] w31: all-zero
* @param[in] FG1.C: s, selection flag
* @param[out] [w[5+N-1]:w5]: new bigint value
* @param[in] FG0.C: needs to be set to 0
*
* clobbered registers: x8, x10, x11, x16, w2, w3, w4, w5 to w[5+N-1]
* clobbered flag groups: FG0
*/
cond_sub_mod:
/* setup pointers */
li x8, 5
li x10, 3
li x11, 2
/* reset flags for FG0 */
bn.add w31, w31, w31
/* iterate over all limbs for limb-wise subtraction + conditional selection*/
loop x30, 5
/* load a limb of modulus from dmem to w3 */
bn.lid x10, 0(x16++)
/* load the limb of bigint buffer to w2 */
bn.movr x11, x8
/* subtract the current limb of the modulus from current limb of bigint */
bn.subb w4, w2, w3
/* conditionally select subtraction result or unmodified limb */
bn.sel w3, w4, w2, FG1.C
/* move back result from w3 to bigint buffer */
bn.movr x8++, x10
ret
/**
* Compute square of Montgomery modulus
*
* Returns RR = R^2 mod M
* with M being the Modulus of length 256..4096 bit
* R = 2^(256*N), N is the number of limbs per bigint
*
* The squared Montgomery modulus (RR) is needed to transform bigints to and
* from the Montgomery domain.
*
* RR is computed in this subroutine by iteratively doubling and reduction.
*
* Flags: The states of both FG0 and FG1 depend on intermediate values and are
* not usable after return.
*
* @param[in] x16: dptr_M, pointer to first limb of modulus in dmem
* @param[in] x18: dptr_RR: dmem pointer to first limb of output buffer for RR
* @param[in] x30: N, number of limbs
* @param[in] w31: all-zero
* @param[out] dmem[dptr_RR+N*32:dptr_RR]: computed RR
*
* clobbered registers: x3, x8, x10, x11, x22
* w0, w2, w3, w4, w5 to w20 depending on N
* clobbered flag groups: FG0, FG1
*/
compute_rr:
/* save pointer to modulus */
addi x22, x16, 0
/* zeroize w3 */
bn.xor w3, w3, w3
/* compute full length of current bigint size in bits
N*w = x24 = N*256 = N*2^8 = x30 << 8 */
slli x24, x30, 8
/* reg pointers */
li x8, 5
li x10, 3
/* zeroize w3 */
bn.xor w3, w3, w3
/* zeroize all limbs of bigint in regfile */
loop x30, 1
bn.movr x8++, x10
/* compute R-M
since R = 2^(N*w), this can be computed as R-M = unsigned(0-M) */
bn.addi w0, w31, 1
bn.sub w3, w31, w0, FG1
addi x16, x22, 0
jal x1, cond_sub_mod
/* Compute R^2 mod M = R*2^(N*w) mod M.
=> R^2 mod M can be computed by performing N*w duplications of R.
We directly perform a modulo reduction in each step such that the
final result will already be reduced. */
loop x24, 18
/* reset pointer */
li x8, 5
/* zeroize w3 reset flags of FG1 */
bn.sub w3, w3, w3, FG1
/* Duplicate the intermediate bigint result. This can overflow such that
bit 2^(N*w) (represented by the carry bit after the final loop cycle)
is set. */
loop x30, 3
/* copy current limb of bigint to w2 */
bn.movr x11, x8
/* perform the doubling */
bn.addc w2, w2, w2, FG1
/* copy result back to bigint in regfile */
bn.movr x8++, x11
/* Conditionally subtract the modulus from the current bigint Y if there
was an overflow. Again, just considering the lowest N*w bits is
sufficient, since (in case of an overflow) we can write
2*Y as 2^(N*w) + X with M > X >= 0.
Then, 2*Y - M = 2^(N*w) + X - M = X + unsigned(0-M) */
addi x16, x22, 0
jal x1, cond_sub_mod
/* reset pointer to 1st limb of bigint in regfile */
li x8, 5
/* reset pointer to modulus in dmem */
addi x16, x22, 0
/* reset flags of FG1 */
bn.sub w3, w3, w3, FG1
/* compare intermediate bigint y with modulus
subtract modulus if Y > M */
loop x30, 3
bn.lid x10, 0(x16++)
bn.movr x11, x8++
bn.cmpb w3, w2, FG1
addi x16, x22, 0
jal x1, cond_sub_mod
li x0, 0
/* reset pointer to 1st limb of bigint in regfile */
li x8, 5
/* reset pointer to modulus */
addi x16, x22, 0
/* store computed RR in dmem */
addi x3, x18, 0
loop x30, 2
bn.sid x8, 0(x3++)
addi x8, x8, 1
ret
/**
* Unrolled 512=256x256 bit multiplication.
*
* Returns c = a x b.
*
* Flags: No flags are set in this subroutine
*
* @param[in] w30: a, first operand
* @param[in] w25: b, second operand
* @param[out] [w26, w27]: c, result
*
* clobbered registers: w26, w27
* clobbered flag groups: none
*/
mul256_w30xw25:
bn.mulqacc.z w30.0, w25.0, 0
bn.mulqacc w30.1, w25.0, 64
bn.mulqacc.so w27.L, w30.0, w25.1, 64
bn.mulqacc w30.2, w25.0, 0
bn.mulqacc w30.1, w25.1, 0
bn.mulqacc w30.0, w25.2, 0
bn.mulqacc w30.3, w25.0, 64
bn.mulqacc w30.2, w25.1, 64
bn.mulqacc w30.1, w25.2, 64
bn.mulqacc.so w27.U, w30.0, w25.3, 64
bn.mulqacc w30.3, w25.1, 0
bn.mulqacc w30.2, w25.2, 0
bn.mulqacc w30.1, w25.3, 0
bn.mulqacc w30.3, w25.2, 64
bn.mulqacc.so w26.L, w30.2, w25.3, 64
bn.mulqacc.so w26.U, w30.3, w25.3, 0
ret
/**
* Unrolled 512=256x256 bit multiplication.
*
* Returns c = a x b.
*
* Flags: No flags are set in this subroutine
*
* @param[in] w30: a, first operand
* @param[in] w2: b, second operand
* @param[out] [w26, w27]: c, result
*
* clobbered registers: w26, w27
* clobbered flag groups: none
*/
mul256_w30xw2:
bn.mulqacc.z w30.0, w2.0, 0
bn.mulqacc w30.1, w2.0, 64
bn.mulqacc.so w27.L, w30.0, w2.1, 64
bn.mulqacc w30.2, w2.0, 0
bn.mulqacc w30.1, w2.1, 0
bn.mulqacc w30.0, w2.2, 0
bn.mulqacc w30.3, w2.0, 64
bn.mulqacc w30.2, w2.1, 64
bn.mulqacc w30.1, w2.2, 64
bn.mulqacc.so w27.U, w30.0, w2.3, 64
bn.mulqacc w30.3, w2.1, 0
bn.mulqacc w30.2, w2.2, 0
bn.mulqacc w30.1, w2.3, 0
bn.mulqacc w30.3, w2.2, 64
bn.mulqacc.so w26.L, w30.2, w2.3, 64
bn.mulqacc.so w26.U, w30.3, w2.3, 0
ret
/**
* Constant time conditional bigint subtraction
*
* Returns C <= C-s*B
* with B being a bigint of length 256..4096 bit
* C being a bigint of length 256..4096 bit
* s being a boolean value [0,1]
*
* Depending on state of FG1.C subtracts a bigint B located in dmem from
* another bigint C, located in the wide reg file and stores result at same
* position in wide reg file.
*
* Flags: When leaving this subroutine, flags of FG0 depend on a
* potentially discarded value and therefore are not usable after
* return. FG1 is not modified in this subroutine.
*
* @param[in] x16: dmem pointer to first limb of subtrahend (B)
* @param[in] x8: regfile pointer to first limb of minuend and result (C)
* @param[in] FG.C: s, subtraction flag, subtract if 1
* @param[in] x30: number of limbs
* @param[in] FG0.C: needs to be set to 0
*
* clobbered registers: x8, x16, w24, w29, w30, w[x8] to w[x8+N-1]
* clobbered Flag Groups: FG0
*/
cond_sub_to_reg:
/* load pointers to temp regs */
li x12, 30
li x13, 24
/* iterate over all limbs for conditional limb-wise subtraction */
loop x30, 6
/* load limb of subtrahend (input B) to w24 */
bn.lid x13, 0(x16++)
/* load limb of minuend (input C) to w30 */
bn.movr x12, x8
/* perform subtraction for a limb */
bn.subb w29, w30, w24
bn.movr x8, x13
/* conditionally select subtraction result or unmodified limb */
bn.sel w24, w29, w30, FG1.C
/* store selection result in reg file */
bn.movr x8++, x13
ret
/**
* Main loop body for constant-time Montgomery Modular Multiplication
*
* Returns: C <= (C + A*b_i + M*m0'*(C[0] + A[0]*b_i))/(2^WLEN) mod R
*
* This implements the main loop body for the Montgomery Modular Multiplication
* as well as the conditional subtraction. See e.g. Handbook of Applied
* Cryptography (HAC) 14.36 (steps 2.1 and 2.2) and step 3.
* This subroutine has to be called for every iteration of the loop in step 2
* of HAC 14.36, i.e. once per limb of operand B (x in HAC notation). The limb
* is supplied via w2. In the explanations below, the index i refers to the
* i_th call to this subroutine within one full Montgomery Multiplication run.
* Step 3 of HAC 14.36 is replaced by the approach to perform the conditional
* subtraction when the intermediate result is larger than R instead of m. See
* e.g. https://eprint.iacr.org/2017/1057 section 2.4.2 for a justification.
* This does not omit the conditional subtraction, but simplifies the
* comparison. The subtraction is carried out in constant time manner.
* Variable names of HAC are mapped as follows to the ones used in the
* this library: x=B, y=A, A=C, b=2^WLEN, m=M, R=R, m' = m0', n=N.
*
* Flags: The states of both FG0 and FG1 depend on intermediate values and are
* not usable after return.
*
* @param[in] x16: dmem pointer to first limb of modulus M
* @param[in] x19: dmem pointer to first limb operand A
* @param[in] x31: N-1, number of limbs minus one
* @param[in] w2: current limb of operand B, b_i
* @param[in] w3: Montgomery constant m0'
* @param[in] w31: all-zero
* @param[in] [w[4+N-1]:w4] intermediate result A
* @param[out] [w[4+N-1]:w4] intermediate result A
*
* clobbered registers: x8, x10, x12, x13, x16, x19
* w24, w25, w26, w27, w28, w29, w30, w4 to w[4+N-1]
* clobbered Flag Groups: FG0, FG1
*/
mont_loop:
/* save pointer to modulus */
addi x22, x16, 0
/* pointers to temp. wregs */
li x12, 30
li x13, 24
/* buffer read pointer */
li x8, 4
/* buffer write pointer */
li x10, 4
/* load 1st limb of input y (operand a): w30 = y[0] */
bn.lid x12, 0(x19++)
/* This is x_i*y_0 in step 2.1 of HAC 14.36 */
/* [w26, w27] = w30*w2 = y[0]*x_i */
jal x1, mul256_w30xw2
/* w24 = w4 = A[0] */
bn.movr x13, x8++
/* add A[0]: [w29, w30] = [w26, w27] + w24 = y[0]*x_i + A[0] */
bn.add w30, w27, w24
/* this serves as c_xy in the first cycle of the loop below */
bn.addc w29, w26, w31
/* w25 = w3 = m0' */
bn.mov w25, w3
/* multiply by m0', this concludes Step 2.1 of HAC 14.36 */
/* [_, u_i] = [w26, w27] = w30*w25 = (y[0]*x_i + A[0])*m0'*/
jal x1, mul256_w30xw25
/* With the computation of u_i, the computations in a cycle 0 of the loop
below are already partly done. The following instructions (until the
start of the loop) implement the remainder, such that cycle 0 can be
omitted in the loop */
/* [_, u_i] = [w28, w25] = [w26, w27] */
bn.mov w25, w27
bn.mov w28, w26
/* w24 = w30 = y[0]*x_i + A[0] mod b */
bn.mov w24, w30
/* load first limb of modulus: w30 = m[0] */
bn.lid x12, 0(x16++)
/* [w26, w27] = w30*w25 = m[0]*u_i*/
jal x1, mul256_w30xw25
/* [w28, w27] = [w26, w27] + w24 = m[0]*u_i + (y[0]*x_i + A[0] mod b) */
bn.add w27, w27, w24
/* this serves as c_m in the first cycle of the loop below */
bn.addc w28, w26, w31
/* This loop implements step 2.2 of HAC 14.36 with a word-by-word approach.
The loop body is subdivided into two steps. Each step performs one
multiplication and subsequently adds two WLEN sized words to the
2WLEN-sized result, such that there are no overflows at the end of each
step-
Two carry words are required between the cycles. Those are c_xy and c_m.
Assume that the variable j runs from 1 to N-1 in the explanations below.
A cycle 0 is omitted, since the results from the computations above are
re-used */
loop x31, 14
/* Step 1: First multiplication takes a limb of each of the operands and
computes the product. The carry word from the previous cycle c_xy and
the j_th limb of the buffer A, A[j] are added to the multiplication
result.
/* load limb of y (operand a) and mult. with x_i: [w26, w27] <= y[j]*x_i */
bn.lid x12, 0(x19++)
jal x1, mul256_w30xw2
/* add limb of buffer: [w26, w27] <= [w26,w27] + w24 = y[j]*x_i + A[j] */
bn.movr x13, x8++
bn.add w27, w27, w24
bn.addc w26, w26, w31
/* add carry word from previous cycle:
[c_xy, a_tmp] = [w29, w24] <= [w26,w27] + w29 = y[j]*x_i + A[j] + c_xy*/
bn.add w24, w27, w29
bn.addc w29, w26, w31
/* Step 2: Second multiplication computes the product of a limb m[j] of
the modulus with u_i. The 2nd carry word from the previous loop cycle
c_m and the lower word a_tmp of the result of Step 1 are added. */
/* load limb m[j] of modulus and multiply with u_i:
[w26, w27] = w30*w25 = m[j+1]*u_i */
bn.lid x12, 0(x16++)
jal x1, mul256_w30xw25
/* add result from first step
[w26, w27] <= [w26,w27] + w24 = m[j+1]*u_i + a_tmp */
bn.add w27, w27, w24
bn.addc w26, w26, w31
/* [c_m, A[j]] = [w28, w24] = m[j+1]*u_i + a_tmp + c_m */
bn.add w24, w27, w28, FG1
/* store at w[4+j] = A[j-1]
This includes the reduction by 2^WLEN = 2^b in step 2.2 of HAC 14.36 */
bn.movr x10++, x13
bn.addc w28, w26, w31, FG1
/* Most significant limb of A is sum of the carry words of last loop cycle
A[N-1] = w24 <= w29 + w28 = c_xy + c_m */
bn.addc w24, w29, w28, FG1
bn.movr x10++, x13
/* restore pointers */
addi x16, x22, 0
li x8, 4
li x10, 4
/* This replaces Step 3 of HAC 14.36 and performs conditional constant-time
subtraction of the modulus from the output buffer. */
jal x1, cond_sub_to_reg
/* restore pointer again */
addi x16, x22, 0
/* restore pointer */
li x8, 4
ret
/**
* Constant time conditional bigint subtraction
*
* Returns C = A-x*B
* with A being a bigint of length 256..4096 bit
* B being a bigint of length 256..4096 bit
* C being a bigint of length 256..4096 bit
* x being a boolean value [0,1]
*
* Depending on state of FG1.C subtracts a bigint B located in dmem from
* another bigint A, located in the wide reg file and stores result C in dmem.
*
* Flags: When leaving this subroutine, flags of FG0 depend on a
* potentially discarded value and therefore are not usable after
* return. FG1 is not modified in this subroutine.
*
* @param[in] x16: dmem pointer to first limb of subtrahend (B)
* @param[in] x8: regfile pointer to first limb of minuend (input A)
* @param[in] x21: dmem pointer to first limb of result (C)
* @param[in] x30: N, number of limbs
* @param[in] FG1.C: subtraction condition, subtract if 1 (x)
* @param[in] x9: pointer to temp reg, must be set to 3
* @param[in] x11: pointer to temp reg, must be set to 2
* @param[in] FG0.C: needs to be set to 0
*
* clobbered registers: x8, x16, x21, w2, w3
* clobbered Flag Groups: FG0
*/
cond_sub_to_dmem:
/* iterate over all limbs for conditional limb-wise subtraction */
loop x30, 5
/* load limb of subtrahend (input B): w3 = dmem[x16+i] */
bn.lid x9, 0(x16++)
/* move limb from bignum bufer to w2 */
bn.movr x11, x8++
/* perform subtraction for a limb w3 = w2-1 */
bn.subb w3, w2, w3
/* conditionally select subtraction result or unmodified limb */
bn.sel w2, w3, w2, FG1.C
/* store selection result in dmem */
bn.sid x11, 0(x21++)
ret
/**
* Constant-time Montgomery modular multiply by one
*
* Returns: C = montmul(1,A) = A*R^(-1) mod M
*
* Routine for back-conversion from Montgomery domain.
* This implements the limb-by-limb interleaved Montgomery Modular
* Multiplication Algorithm, with one operand fixed to 1. This is only a
* wrapper around the main loop body. For algorithmic implementation details
* see the mont_loop subroutine.
*
* Flags: The states of both FG0 and FG1 depend on intermediate values and are
* not usable after return.
*
* @param[in] x16: dmem pointer to first limb of modulus M
* @param[in] x17: dptr_m0d: dmem pointer to Montgomery Constant m0'
* @param[in] x19: dmem pointer to first limb of operand A
* @param[in] x21: dmem pointer to first limb of result C
* @param[in] x30: N, number of limbs
* @param[in] x31: N-1, number of limbs minus one
* @param[in] x8: pointer to temp reg, must be set to 4
* @param[in] x9: pointer to temp reg, must be set to 3
* @param[in] x10: pointer to temp reg, must be set to 4
* @param[in] x11: pointer to temp reg, must be set to 2
* @param[in] w31: all-zero
*
* clobbered registers: x6, x7, x8, x9, x10, x12, x13, x16, x19, x21
* w2. w3. w24, w25, w26, w27, w28, w29, w30
* w4 to w[4+N-1]
* clobbered Flag Groups: FG0, FG1
*/
montmul_mul1:
/* load Montgomery constant: w3 = dmem[x17] = dmem[dptr_m0d] = m0' */
bn.lid x9, 0(x17)
/* init regfile bigint buffer with zeros */
bn.mov w2, w31
loop x30, 1
bn.movr x10++, x11
/* w2=1 this is operand B */
bn.xor w2, w2, w2
bn.addi w2, w2, 1
/* save dmem pointers for operand A and modulus */
addi x6, x16, 0
addi x7, x19, 0
/* iterate over limbs of operand B */
loop x30, 4
/* restore dmem pointers for operand A and modulus */
addi x16, x6, 0
addi x19, x7, 0
/* Main loop body of Montgomery Multiplication algorithm */
/* 1[i]*A */
jal x1, mont_loop
/* all subsequent limbs of operand B are zero since B=1 */
bn.mov w2, w31
/* restore dmem pointers for operand A and modulus */
addi x16, x6, 0
addi x19, x7, 0
/* zeroize w2 and clear flags */
bn.sub w2, w2, w2, FG1
/* iterate over all limbs of bigint buffer for limbwise comparison of
buffer with the Modulus. After last loop cycle, FG1.C is set if bigint
in buffer is larger than Modulus */
loop x30, 3
/* load limb of limb of Modulus to w3 */
bn.lid x9, 0(x16++)
/* load limb from bigint buffer to w2 */
bn.movr x11, x8++
/* compare limb of flag with limb of Modulus */
bn.cmpb w3, w2, FG1
/* restore pointers to bigint buffer in regfile */
li x8, 4
li x10, 4
/* restore dmem pointers for operand A and modulus */
addi x16, x6, 0
addi x19, x7, 0
/* conditionally subtract Modulus from buffer and store result in
dmem[x21] to dmem[x21+N] */
jal x1, cond_sub_to_dmem
/* restore dmem pointers for operand A and modulus */
addi x16, x6, 0
addi x19, x7, 0
ret
/**
* Constant-time Montgomery Modular Multiplication
*
* Returns: C = montmul(A,B) = A*B*R^(-1) mod M
*
* This implements the limb-by-limb interleaved Montgomery Modular
* Multiplication Algorithm. This is only a wrapper around the main loop body.
* For algorithmic implementation details see the mont_loop subroutine.
*
* Flags: The states of both FG0 and FG1 depend on intermediate values and are
* not usable after return.
*
* @param[in] x16: dptr_M, dmem pointer to first limb of modulus M
* @param[in] x17: dptr_m0d, dmem pointer to Montgomery Constant m0'
* @param[in] x19: dptr_a, dmem pointer to first limb of operand A
* @param[in] x20: dptr_b, dmem pointer to first limb of operand B
* @param[in] w31: all-zero
* @param[in] x30: N, number of limbs
* @param[in] x31: N-1, number of limbs minus one
* @param[in] x9: pointer to temp reg, must be set to 3
* @param[in] x10: pointer to temp reg, must be set to 4
* @param[in] x11: pointer to temp reg, must be set to 2
* @param[out] [w[4+N-1]:w4]: result C
*
* clobbered registers: x5, x6, x7, x8, x10, x12, x13, x16, x17, x19, x20
* w2, w3, w24 to w30, w4 to w[4+N-1]
* clobbered Flag Groups: FG0, FG1
*/
montmul:
/* load Montgomery constant: w3 = dmem[x17] = dmem[dptr_m0d] = m0' */
bn.lid x9, 0(x17)
/* init regfile bigint buffer with zeros */
bn.mov w2, w31
loop x30, 1
bn.movr x10++, x11
/* iterate over limbs of operand B */
loop x30, 8
/* load limb of operand b */
bn.lid x11, 0(x20++)
/* save some regs */
addi x5, x20, 0
addi x6, x16, 0
addi x7, x19, 0
/* Main loop body of Montgomery Multiplication algorithm */
jal x1, mont_loop
/* restore regs */
addi x20, x5, 0
addi x16, x6, 0
addi x19, x7, 0
/* restore pointers */
li x8, 4
li x10, 4
ret
/**
* Conditionally overwrite bigint in dmem
*
* Depending on state of FG0.C overwrites a bigint in dmem with one from
* a buffer in the wide reg file.
*
* Flags: Does not set any flags, does not use flags except FG0.C
*
* @param[in] x21: dptr, pointer to first limb of bigint in dmem
* @param[in] x8: rptr, pointer to first limb of bigint in regfile buffer
* @param[in] FG.C: selection condition, overwrite dmem when FG0.C==1
* @param[in] x30: number of limbs
* @param[in] x9: pointer to temp reg, must be set to 3
* @param[in] x11: pointer to temp reg, must be set to 2
*
* clobbered registers: x8, x21, w0, w2
* clobbered Flag Groups: none
*/
sel_sqr_or_sqrmul:
/* iterate over all limbs */
loop x30, 6
/* load limb from dmem */
bn.lid x9, 0(x21)
/* store limb to dmem */
bn.sid x11, 0(x21)
/* load limb from regfile buffer */
bn.movr x11, x8++
bn.mov w0, w2
/* conditional select: w2 = FG0.C?w[x8+i]:dmem[x21+i] */
bn.sel w2, w0, w3, C
/* store selected limb to dmem */
bn.sid x11, 0(x21++)
ret
/**
* Constant-time bigint modular exponentiation
*
* Returns: C = modexp(A,E) = A^E mod M
*
* This implements the square and multiply algorithm, i.e. for each bit of the
* exponent both the squared only and the squared with multiply results are
* computed but one result is discarded.
* Computation is carried out in the Montgomery domain, by using the montmul
* primitive.
* The squared Montgomery modulus RR and the Montgomery constant m0' have to
* be precomputed and provided at the appropriate locations in dmem.
*
* Flags: The states of both FG0 and FG1 depend on intermediate values and are
* not usable after return.
*
* The base bignum A is expected in the input buffer, the exponent E in the
* exp buffer, the result C is written to the output buffer.
* Note, that the content of both, the input buffer and the exp buffer is
* modified during execution.
*
* @param[in] x2: dptr_c, dmem pointer to buffer for output C
* @param[in] x14: dptr_a, dmem pointer to first limb of input A
* @param[in] x15: dptr_e, dmem pointer to first limb of exponent E
* @param[in] x16: dptr_M, dmem pointer to first limb of modulus M
* @param[in] x17: dptr_m0d, dmem pointer to first limb of m0'
* @param[in] x18: dptr_RR, dmem pointer to first limb of RR
* @param[in] x30: N, number of limbs per bignum
* @param[in] w31: all-zero
* @param[out] dmem[dptr_c:dptr_c+N*32] C, A^E mod M
*
* clobbered registers: x3 to x13, x16 to x31
* w0 to w3, w24 to w30
* w4 to w[4+N-1]
* clobbered Flag Groups: FG0, FG1
*/
modexp:
/* prepare pointers to temp regs */
li x8, 4
li x9, 3
li x10, 4
li x11, 2
/* Compute (N-1).
x31 <= x30 - 1 = N - 1 */
addi x31, x30, -1
/* Convert input to montgomery domain.
dmem[dptr_a] <= montmul(A,RR) = A*R mod M */
addi x19, x14, 0
addi x20, x18, 0
addi x21, x14, 0
jal x1, montmul
loop x30, 2
bn.sid x8, 0(x21++)
addi x8, x8, 1
/* zeroize w2 and reset flags */
bn.sub w2, w2, w2
/* initialize the output buffer with -M */
addi x3, x16, 0
addi x21, x2, 0
loop x30, 3
/* load limb from modulus */
bn.lid x11, 0(x3++)
/* subtract limb from 0 */
bn.subb w2, w31, w2
/* store limb in dmem */
bn.sid x11, 0(x21++)
/* compute bit length of current bigint size */
slli x24, x30, 8
/* iterate over all bits of bigint */
loop x24, 20
/* square: out = montmul(out,out) */
addi x19, x2, 0
addi x20, x2, 0
addi x21, x2, 0
jal x1, montmul
/* Store result in dmem starting at dmem[dptr_c] */
loop x30, 2
bn.sid x8, 0(x21++)
addi x8, x8, 1
/* multiply: out = montmul(in,out) */
addi x19, x14, 0
addi x20, x2, 0
addi x21, x2, 0
jal x1, montmul
/* w2 <= w2 << 1 */
bn.add w2, w2, w2
/* the loop performs a 1-bit left shift of the exponent. Last MSB moves
to FG0.C, such that it can be used for selection */
addi x20, x15, 0
loop x30, 3
bn.lid x11, 0(x20)
/* w2 <= w2 << 1 */
bn.addc w2, w2, w2
bn.sid x11, 0(x20++)
/* select squared or squared+multiplied result */
addi x21, x2, 0
jal x1, sel_sqr_or_sqrmul
nop
/* convert back from montgomery domain */
/* out = montmul(out,1) = out/R mod M */
addi x19, x2, 0
addi x21, x2, 0
jal x1, montmul_mul1
ret
/**
* Bigint modular exponentiation with fixed exponent of 65537
*
* Returns: C = modexp(A,65537) = A^65537 mod M
*
* This implements the square and multiply algorithm for the fixed exponent
* of E=65537. Note that this implementation (in contrast to modexp) runs the
* multiplication step only for bits being actually set in the exponent.
* Since the exponent is fixed, this is inherently constant-time.
*
* The squared Montgomery modulus RR and the Montgomery constant m0' have to
* be precomputed and provided at the appropriate locations in dmem.
*
* Flags: The states of both FG0 and FG1 depend on intermediate values and are
* not usable after return.
*
* The base bignum A is expected in the input buffer, the result C is written
* to the output buffer. Note, that the content of the input buffer is
* modified during execution.
*
* @param[in] x2: dptr_c, dmem pointer to buffer for output C
* @param[in] x14: dptr_a, dmem pointer to first linb of input A
* @param[in] x16: dptr_M, dmem pointer to first limb of modulus M
* @param[in] x17: dptr_m0d, dmem pointer to Mongtgomery constant m0'
* @param[in] x18: dptr_RR, dmem pointer to Montgmery constant RR
* @param[in] x30: N, number of limbs per bignum
* @param[in] w31: all-zero
* @param[out] dmem[dptr_c:dptr_c+N*32] C, A^65537 mod M
*
* clobbered registers: x3 to x13, x16 to x31
* w0 to w3, w24 to w30
* w4 to w[4+N-1]
* clobbered Flag Groups: FG0, FG1
*/
modexp_65537:
/* prepare pointers to temp regs */
li x8, 4
li x9, 3
li x10, 4
li x11, 2
/* Compute (N-1).
x31 <= x30 - 1 = N - 1 */
addi x31, x30, -1
/* convert to montgomery domain montmul(A,RR)
in = montmul(A,RR) montmul(A,RR) = C*R mod M */
addi x19, x14, 0
addi x20, x18, 0
addi x21, x14, 0
jal x1, montmul
/* Store result in dmem starting at dmem[dptr_a] */
loop x30, 2
bn.sid x8, 0(x21++)
addi x8, x8, 1
/* pointer to out buffer */
addi x21, x2, 0
/* zeroize w2 and reset flags */
bn.sub w2, w2, w2
/* pointer to modulus */
addi x3, x16, 0
/* this loop initializes the output buffer with -M */
loop x30, 3
/* load limb from modulus */
bn.lid x11, 0(x3++)
/* subtract limb from 0 */
bn.subb w2, w31, w2
/* store limb in dmem */
bn.sid x11, 0(x21++)
/* TODO: Is this squaring necessary? */
/* 65537 = 0b10000000000000001
^ sqr + mult
out = montmul(out,out) */
addi x19, x2, 0
addi x20, x2, 0
jal x1, montmul
/* Store result in dmem starting at dmem[dptr_c] */
addi x21, x2, 0
loop x30, 2
bn.sid x8, 0(x21++)
addi x8, x8, 1
/* out = montmul(in,out) */
addi x19, x14, 0
addi x20, x2, 0
jal x1, montmul
/* store multiplication result in output buffer */
addi x21, x2, 0
li x8, 4
loop x30, 2
bn.sid x8, 0(x21++)
addi x8, x8, 1
/* 65537 = 0b10000000000000001
^<< 16 x sqr >>^ */
loopi 16, 8
/* square: out = montmul(out, out) */
addi x19, x2, 0
addi x20, x2, 0
jal x1, montmul
/* Store result in dmem starting at dmem[dptr_c] */
addi x21, x2, 0
loop x30, 2
bn.sid x8, 0(x21++)
addi x8, x8, 1
nop
/* 65537 = 0b10000000000000001
mult ^
out = montmul(in,out) */
addi x19, x14, 0
addi x20, x2, 0
jal x1, montmul
/* store multiplication result in output buffer */
addi x21, x2, 0
li x8, 4
loop x30, 2
bn.sid x8, 0(x21++)
addi x8, x8, 1
/* convert back from montgomery domain */
/* out = montmul(out,1) = out/R mod M */
addi x19, x2, 0
addi x21, x2, 0
jal x1, montmul_mul1
ret
/**
* Externally callable wrapper for computing Montgomery parameters
*
* Computes:
* - Montgomery Constant m0'
* - Squared Montgomery modulus RR mod N
*
* Needs to be executed once per constant Modulus.
*
* @param[in] x16: dptr_M, dmem pointer to first limb of modulus M
* @param[in] x17: dptr_m0d, dmem pointer to buffer for m0'
* @param[in] x18: dptr_RR, dmem pointer to buffer for RR
* @param[in] x30: N, number of limbs per bignum
* @param[in] w31: all-zero
* @param[out] [dmem[dptr_m0d+31]:dmem[dptr_m0d]] computed m0'
* @param[out] [dmem[dptr_RR+N*32-1]:dmem[dptr_RR]] computed RR
*/
modload:
/* load lowest limb of modulus to w28 */
li x8, 28
bn.lid x8, 0(x16)
/* Compute Montgomery constant */
jal x1, m0inv
/* Store Montgomery constant in dmem */
li x9, 29
bn.sid x9, 0(x17)
/* Compute square of Montgomery modulus */
jal x1, compute_rr
ret