sw/otbn/crypto/x25519.s - 3p/lowrisc/opentitan - Git at Google

 /* Copyright lowRISC contributors. */
 /* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
 /* SPDX-License-Identifier: Apache-2.0 */

 /**
  * This library contains code for X25519, for use in elliptic-curve Diffie
  * Hellman key exchange.
  *
  * Helpful references:
  * RFC: https://datatracker.ietf.org/doc/html/rfc7748
  * Academic paper: https://www.iacr.org/cryptodb/archive/2006/PKC/3351/3351.pdf
  */

 /**
  * Top-level X25519 function, as defined in RFC 7748.
  *
  * This routine runs in constant time.
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
  * @param[in]  w8: enc(k), encoded scalar (256-bit random number)
  * @param[in]  w9: enc(u), encoded Montgomery u-coordinate (256 bits)
  * @param[out] w22: result, X25519(k, u) as an encoded u-coordinate
  *
  * clobbered registers: w2 to w24
  * clobbered flag groups: FG0
  */
 .globl X25519
 X25519:
   /* Prepare all-zero register. */
   bn.xor   w31, w31, w31
   /* Prepare constant for field operations.
      w19 <= 19  */
   bn.addi  w19, w31, 19

   /* Load modulus from DMEM.
        MOD <= dmem[modulus25519] = 2^255 - 19 = p */
   li      x2, 2
   la      x3, modulus25519
   bn.lid  x2, 0(x3)
   bn.wsrw 0x0, w2

   /* Decode scalar. From RFC 7748, section 5:

    "For X25519, in order to decode 32 random bytes as an integer scalar, set
     the three least significant bits of the first byte and the most significant bit
     of the last to zero, set the second most significant bit of the last byte to 1
     and, finally, decode as little-endian." */

   /* w8 <= (w8 >> 3) = enc(k) >> 3 */
   bn.rshi  w8, w31, w8 >> 3
   /* w8 <= w8 << 5 = ((enc(k) >> 3) << 5) mod 2^256 */
   bn.rshi  w8, w8, w31 >> 251
   /* w8 <= 2^254 + (w8 >> 2) = ((enc(k) >> 3) << 3) mod 2^254 + 2^254 = k*/
   bn.addi  w7, w31, 1
   bn.rshi  w8, w7, w8 >> 2

   /* Decode point u-coordinate. As specified in RFC 7748, section 5, we must
      zero the most significant bit and accept non-canonical values (i.e. cases
      where p <= u) as if they had been reduced modulo p. */

   /* w7 <= ~w31 = 2^256 - 1 */
   bn.not   w7, w31
   /* w7 <= w7 >> 1 = 2^255 - 1 */
   bn.rshi  w7, w31, w7 >> 1
   /* w9 <= w9 & w7 = enc(u) mod 2^255 */
   bn.and   w9, w9, w7
   /* w9 <= w9 mod p <= (enc(u) mod 2^255) mod p = u */
   bn.addm  w9, w9, w31

   /* Note that, in contrast to Ed25519, the given point does not have to be
      validated; see the original Curve25519 paper for discussion (search
      for "small-subgroup attacks" and "free key validation").
      https://www.iacr.org/cryptodb/archive/2006/PKC/3351/3351.pdf */

   /* Perform scalar multiplication.
        w22 <= scalar_mult(k, u) = X25519(k, u) */
   jal     x1, scalar_mult

   /* Since the scalar multiplication routine has already reduced its result
      modulo p and OTBN is little-endian, encoding the result is a no-op. */

   ret

 /**
  * Elliptic-curve scalar multiplication for Curve25519.
  *
  * Returns B = kA.
  *
  * Multiplies a point on the curve (A) with a scalar (k). Uses the Montgomery
  * ladder algorithm as in RFC 7748. For the Montgomery ladder, we only need to
  * consider the u-coordinate of curve points.
  *
  * See RFC 7748, section 5:
  *   https://datatracker.ietf.org/doc/html/rfc7748#section-5
  *
  * This routine runs in constant time.
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
  * @param[in]  w8: k, 255-bit scalar value
  * @param[in]  w9: u, Montgomery u-coordinate of point A (k < p)
  * @param[in]  w19: constant, w19 = 19
  * @param[in]  w31, all-zero
  * @param[in]  MOD: p, modulus = 2^255 - 19
  * @param[out] w22: result, Montgomery u-coordinate of point kA
  *
  * clobbered registers: w2 to w7, w10 to w18, w20 to w24
  * clobbered flag groups: FG0
  */
 scalar_mult:

   /* Load the constant a24:
        w24 <= dmem[a24] = 121665 */
   li      x2, 24
   la      x3, a24
   bn.lid  x2, 0(x3)

   /* Set initial values. From RFC 7748:

      x_2 = 1
      z_2 = 0
      x_3 = u
      z_3 = 1
      swap = 0
  */

   /* x_2 = w10 <= 1 */
   bn.addi  w10, w31, 1
   /* z_2 = w12 <= 0 */
   bn.mov   w12, w31
   /* x_3 = w11 <= u */
   bn.mov   w11, w9
   /* z_3 = w13 <= 1 */
   bn.addi  w13, w31, 1
   /* swap = w15 <= 0 */
   bn.mov   w15, w31

   /* Initialize w4 for loop.
        w14 <= (k << 1) mod 2^256 */
   bn.rshi  w14, w8, w31 >> 255

   /* Main loop. From RFC 7748 (with `ladderstep` factored into a separate step):

      For t = bits-1 down to 0:
          k_t = (k >> t) & 1
          swap ^= k_t
          (x_2, x_3) = cswap(swap, x_2, x_3)
          (z_2, z_3) = cswap(swap, z_2, z_3)
          swap = k_t

          ladderstep(u, x_2, x_3, z_2, z_3)

      Loop invariants:
        w14 = (k << (255-t)) mod 2^256
        w9 = u

      Loop temporary registers: w6, w7
   */
   loopi   255, 12
     /* w7 <= w14 >> 255 = (k << (255-t)) mod 2^256 >> 255 = k[t] = k_t */
     bn.rshi  w7, w31, w14 >> 255
     /*  w14 <= w14 << 1 = (k << (255-(t-1)) mod 2^256 */
     bn.rshi  w14, w14, w31 >> 255
     /* swap = w15 = w15 ^ w7 = swap ^ k_t */
     bn.xor   w15, w15, w7

     /* Note: The xor above set FG0.Z if and only if swap ^ k_t is zero. */

     /* Conditionally swap x_2 and x_3. */

     /* w6 <= if swap =? 0 then x_2 else x_3 */
     bn.sel  w6, w10, w11, FG0.Z
     /* x_3 = w11 <= if swap =? 0 then x_3 else x_2 */
     bn.sel  w11, w11, w10, FG0.Z
     /* x_2 = w10 <= w6 = if swap =? 0 then x_2 else x_3 */
     bn.mov  w10, w6

     /* Conditionally swap z_2 and z_3. */

     /* w6 <= if swap =? 0 then z_2 else z_3 */
     bn.sel  w6, w12, w13, FG0.Z
     /* z_3 = w13 <= if swap =? 0 then z_3 else z_2 */
     bn.sel  w13, w13, w12, FG0.Z
     /* z_2 = w12 <= w6 = if swap =? 0 then z_2 else z_3 */
     bn.mov  w12, w6

     /* swap = w15 <= w7 = k_t */
     bn.mov  w15, w7

     jal     x1, ladderstep
     nop

   /* Loop done; final steps. From RFC 7748:

      (x_2, x_3) = cswap(swap, x_2, x_3)
      (z_2, z_3) = cswap(swap, z_2, z_3)
      Return x_2 * (z_2^(p - 2))
   */

   /* Set the FG0.Z flag if swap (w15) is 0. */
   bn.add  w15, w15, w31

   /* Conditionally swap x_2 and x_3. */

   /* w6 <= if swap =? 0 then x_2 else x_3 */
   bn.sel  w6, w10, w11, FG0.Z
   /* x_3 = w11 <= if swap =? 0 then x_3 else x_2 */
   bn.sel  w11, w11, w10, FG0.Z
   /* x_2 = w10 <= w6 = if swap =? 0 then x_2 else x_3 */
   bn.mov  w10, w6

   /* Conditionally swap z_2 and z_3. */

   /* w6 <= if swap =? 0 then z_2 else z_3 */
   bn.sel  w6, w12, w13, FG0.Z
   /* z_3 = w13 <= if swap =? 0 then z_3 else z_2 */
   bn.sel  w13, w13, w12, FG0.Z
   /* z_2 = w12 <= w6 = if swap =? 0 then z_2 else z_3 */
   bn.mov  w12, w6

   /* w22 <= fe_inv(w12) = z_2^(p-2) */
   bn.mov  w16, w12
   jal     x1, fe_inv
   /* w22 <= w22 * w10 = (z_2^(p-2)) * x_2 */
   bn.mov  w23, w10
   jal     x1, fe_mul

   ret

 /**
  * Performs the core arithmetic step of the Montgomery ladder.
  *
  * This subroutine is intended to be used as part of a Montgomery ladder
  * implementation. It performs the core arithmetic in the main loop, i.e. the
  * following steps (from RFC 7748, section 5):
  *   A = x_2 + z_2
  *   AA = A^2
  *   B = x_2 - z_2
  *   BB = B^2
  *   E = AA - BB
  *   C = x_3 + z_3
  *   D = x_3 - z_3
  *   DA = D * A
  *   CB = C * B
  *   x_3 = (DA + CB)^2
  *   z_3 = x_1 * (DA - CB)^2
  *   x_2 = AA * BB
  *   z_2 = E * (AA + a24 * E)
  *
  * All arithmetic (+, -, *, ^) above is in the finite field GF(2^255-19).
  *
  * TODO: investigate alternative laddersteps, such as the one in
  * curve25519-donna, which may be faster.
  *
  * This routine runs in constant time.
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
  * @param[in] w9: x_1, Montgomery u-coordinate of point A (k < p)
  * @param[in] w19: constant, w19 = 19
  * @param[in] w24: constant a24 = 121665
  * @param[in] w31: all-zero
  * @param[in] MOD: p, modulus = 2^255 - 19
  * @param[in,out] w10:  x_2
  * @param[in,out] w11:  x_3
  * @param[in,out] w12:  z_2
  * @param[in,out] w13:  z_3
  *
  * clobbered registers: w2 to w5, w10 to w13, w17, w18, w20 to w23
  * clobbered flag groups: FG0
  */
 ladderstep:
   /* First, compute the new x_2 and z_2:
        A = x_2 + z_2
        AA = A^2
        B = x_2 - z_2
        BB = B^2
        E = AA - BB
        x_2 = AA * BB
        z_2 = E * (AA + a24 * E) */

   /* w2 <= w10 + w12 = x_2 + z_2 = A */
   bn.addm  w2, w10, w12
   /* w3 <= w2^2 = A^2 = AA */
   bn.mov   w22, w2
   jal      x1, fe_square
   bn.mov   w3, w22
   /* w4 <= w10 - w12 = x_2 - z_2 = B */
   bn.subm  w4, w10, w12
   /* w22 <= w4^2 = B^2 = BB */
   bn.mov   w22, w4
   jal      x1, fe_square
   /* w5 <= w3 - w22 = AA - BB = E */
   bn.subm  w5, w3, w22
   /* x_2 = w10 <= w22 * w3 = BB * AA */
   bn.mov   w23, w3
   jal      x1, fe_mul
   bn.mov   w10, w22
   /* w23 <= w5 = E */
   bn.mov   w23, w5
   /* TODO: create faster, specialized fe_mul for a24 */
   /* w22 <= w24 * w23 = a24 * E */
   bn.mov   w22, w24
   jal      x1, fe_mul
   /* w22 <= w3 + w22 = AA + (a24 * E) */
   bn.add   w22, w3, w22
   /* z_2 = w12 <= w22 * w23 = (AA + a24 * E) * E */
   jal      x1, fe_mul
   bn.mov   w12, w22

   /* Now, compute the new x_3 and z_3, using A and B from the computation
      above:
        C = x_3 + z_3
        D = x_3 - z_3
        DA = D * A
        CB = C * B
        x_3 = (DA + CB)^2
        z_3 = x_1 * (DA - CB)^2 */

   /* w22 <= w11 + w13 = x_3 + z_3 = C */
   bn.addm  w22, w11, w13
   /* w4 <= w22 * w4 = C * B = CB */
   bn.mov   w23, w4
   jal      x1, fe_mul
   bn.mov   w4, w22
   /* w22 <= w11 - w13 = x_3 - z_3 = D */
   bn.subm  w22, w11, w13
   /* w2 <= w22 * w2 = D * A = DA */
   bn.mov   w23, w2
   jal      x1, fe_mul
   bn.mov   w2, w22
   /* w22 <= w2 + w4 <= DA + CB */
   bn.addm  w22, w2, w4
   /* x_3 = w11 <= w22^2 = (DA + CB)^2 */
   jal      x1, fe_square
   bn.mov   w11, w22
   /* w22 <= w2 - w4 <= DA - CB */
   bn.subm  w22, w2, w4
   /* w22 <= w22^2 = (DA - CB)^2 */
   jal      x1, fe_square
   /* z_3 = w13 <= w22 * w9 = (DA - CB)^2 * x_1 */
   bn.mov   w23, w9
   jal      x1, fe_mul
   bn.mov   w13, w22

   ret

 .data

 /* Modulus p = 2^255 - 19. */
 .balign 32
 modulus25519:
   .word 0xffffffed
   .word 0xffffffff
   .word 0xffffffff
   .word 0xffffffff
   .word 0xffffffff
   .word 0xffffffff
   .word 0xffffffff
   .word 0x7fffffff

 /* Constant (a - 2) / 4 for Curve25519 = 121665 (see RFC 7748, section 5). */
 .balign 32
 a24:
 .word 0x0001db41
 .zero 28
	/* Copyright lowRISC contributors. */
	/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
	/* SPDX-License-Identifier: Apache-2.0 */

	/**
	* This library contains code for X25519, for use in elliptic-curve Diffie
	* Hellman key exchange.
	*
	* Helpful references:
	* RFC: https://datatracker.ietf.org/doc/html/rfc7748
	* Academic paper: https://www.iacr.org/cryptodb/archive/2006/PKC/3351/3351.pdf
	*/

	/**
	* Top-level X25519 function, as defined in RFC 7748.
	*
	* This routine runs in constant time.
	*
	* Flags: Flags have no meaning beyond the scope of this subroutine.
	*
	* @param[in] w8: enc(k), encoded scalar (256-bit random number)
	* @param[in] w9: enc(u), encoded Montgomery u-coordinate (256 bits)
	* @param[out] w22: result, X25519(k, u) as an encoded u-coordinate
	*
	* clobbered registers: w2 to w24
	* clobbered flag groups: FG0
	*/
	.globl X25519
	X25519:
	/* Prepare all-zero register. */
	bn.xor w31, w31, w31
	/* Prepare constant for field operations.
	w19 <= 19 */
	bn.addi w19, w31, 19

	/* Load modulus from DMEM.
	MOD <= dmem[modulus25519] = 2^255 - 19 = p */
	li x2, 2
	la x3, modulus25519
	bn.lid x2, 0(x3)
	bn.wsrw 0x0, w2

	/* Decode scalar. From RFC 7748, section 5:

	"For X25519, in order to decode 32 random bytes as an integer scalar, set
	the three least significant bits of the first byte and the most significant bit
	of the last to zero, set the second most significant bit of the last byte to 1
	and, finally, decode as little-endian." */

	/* w8 <= (w8 >> 3) = enc(k) >> 3 */
	bn.rshi w8, w31, w8 >> 3
	/* w8 <= w8 << 5 = ((enc(k) >> 3) << 5) mod 2^256 */
	bn.rshi w8, w8, w31 >> 251
	/* w8 <= 2^254 + (w8 >> 2) = ((enc(k) >> 3) << 3) mod 2^254 + 2^254 = k*/
	bn.addi w7, w31, 1
	bn.rshi w8, w7, w8 >> 2

	/* Decode point u-coordinate. As specified in RFC 7748, section 5, we must
	zero the most significant bit and accept non-canonical values (i.e. cases
	where p <= u) as if they had been reduced modulo p. */

	/* w7 <= ~w31 = 2^256 - 1 */
	bn.not w7, w31
	/* w7 <= w7 >> 1 = 2^255 - 1 */
	bn.rshi w7, w31, w7 >> 1
	/* w9 <= w9 & w7 = enc(u) mod 2^255 */
	bn.and w9, w9, w7
	/* w9 <= w9 mod p <= (enc(u) mod 2^255) mod p = u */
	bn.addm w9, w9, w31

	/* Note that, in contrast to Ed25519, the given point does not have to be
	validated; see the original Curve25519 paper for discussion (search
	for "small-subgroup attacks" and "free key validation").
	https://www.iacr.org/cryptodb/archive/2006/PKC/3351/3351.pdf */

	/* Perform scalar multiplication.
	w22 <= scalar_mult(k, u) = X25519(k, u) */
	jal x1, scalar_mult

	/* Since the scalar multiplication routine has already reduced its result
	modulo p and OTBN is little-endian, encoding the result is a no-op. */

	ret

	/**
	* Elliptic-curve scalar multiplication for Curve25519.
	*
	* Returns B = kA.
	*
	* Multiplies a point on the curve (A) with a scalar (k). Uses the Montgomery
	* ladder algorithm as in RFC 7748. For the Montgomery ladder, we only need to
	* consider the u-coordinate of curve points.
	*
	* See RFC 7748, section 5:
	* https://datatracker.ietf.org/doc/html/rfc7748#section-5
	*
	* This routine runs in constant time.
	*
	* Flags: Flags have no meaning beyond the scope of this subroutine.
	*
	* @param[in] w8: k, 255-bit scalar value
	* @param[in] w9: u, Montgomery u-coordinate of point A (k < p)
	* @param[in] w19: constant, w19 = 19
	* @param[in] w31, all-zero
	* @param[in] MOD: p, modulus = 2^255 - 19
	* @param[out] w22: result, Montgomery u-coordinate of point kA
	*
	* clobbered registers: w2 to w7, w10 to w18, w20 to w24
	* clobbered flag groups: FG0
	*/
	scalar_mult:

	/* Load the constant a24:
	w24 <= dmem[a24] = 121665 */
	li x2, 24
	la x3, a24
	bn.lid x2, 0(x3)

	/* Set initial values. From RFC 7748:

	x_2 = 1
	z_2 = 0
	x_3 = u
	z_3 = 1
	swap = 0
	*/

	/* x_2 = w10 <= 1 */
	bn.addi w10, w31, 1
	/* z_2 = w12 <= 0 */
	bn.mov w12, w31
	/* x_3 = w11 <= u */
	bn.mov w11, w9
	/* z_3 = w13 <= 1 */
	bn.addi w13, w31, 1
	/* swap = w15 <= 0 */
	bn.mov w15, w31

	/* Initialize w4 for loop.
	w14 <= (k << 1) mod 2^256 */
	bn.rshi w14, w8, w31 >> 255

	/* Main loop. From RFC 7748 (with `ladderstep` factored into a separate step):

	For t = bits-1 down to 0:
	k_t = (k >> t) & 1
	swap ^= k_t
	(x_2, x_3) = cswap(swap, x_2, x_3)
	(z_2, z_3) = cswap(swap, z_2, z_3)
	swap = k_t

	ladderstep(u, x_2, x_3, z_2, z_3)

	Loop invariants:
	w14 = (k << (255-t)) mod 2^256
	w9 = u

	Loop temporary registers: w6, w7
	*/
	loopi 255, 12
	/* w7 <= w14 >> 255 = (k << (255-t)) mod 2^256 >> 255 = k[t] = k_t */
	bn.rshi w7, w31, w14 >> 255
	/* w14 <= w14 << 1 = (k << (255-(t-1)) mod 2^256 */
	bn.rshi w14, w14, w31 >> 255
	/* swap = w15 = w15 ^ w7 = swap ^ k_t */
	bn.xor w15, w15, w7

	/* Note: The xor above set FG0.Z if and only if swap ^ k_t is zero. */

	/* Conditionally swap x_2 and x_3. */

	/* w6 <= if swap =? 0 then x_2 else x_3 */
	bn.sel w6, w10, w11, FG0.Z
	/* x_3 = w11 <= if swap =? 0 then x_3 else x_2 */
	bn.sel w11, w11, w10, FG0.Z
	/* x_2 = w10 <= w6 = if swap =? 0 then x_2 else x_3 */
	bn.mov w10, w6

	/* Conditionally swap z_2 and z_3. */

	/* w6 <= if swap =? 0 then z_2 else z_3 */
	bn.sel w6, w12, w13, FG0.Z
	/* z_3 = w13 <= if swap =? 0 then z_3 else z_2 */
	bn.sel w13, w13, w12, FG0.Z
	/* z_2 = w12 <= w6 = if swap =? 0 then z_2 else z_3 */
	bn.mov w12, w6

	/* swap = w15 <= w7 = k_t */
	bn.mov w15, w7

	jal x1, ladderstep
	nop

	/* Loop done; final steps. From RFC 7748:

	(x_2, x_3) = cswap(swap, x_2, x_3)
	(z_2, z_3) = cswap(swap, z_2, z_3)
	Return x_2 * (z_2^(p - 2))
	*/

	/* Set the FG0.Z flag if swap (w15) is 0. */
	bn.add w15, w15, w31

	/* Conditionally swap x_2 and x_3. */

	/* w6 <= if swap =? 0 then x_2 else x_3 */
	bn.sel w6, w10, w11, FG0.Z
	/* x_3 = w11 <= if swap =? 0 then x_3 else x_2 */
	bn.sel w11, w11, w10, FG0.Z
	/* x_2 = w10 <= w6 = if swap =? 0 then x_2 else x_3 */
	bn.mov w10, w6

	/* Conditionally swap z_2 and z_3. */

	/* w6 <= if swap =? 0 then z_2 else z_3 */
	bn.sel w6, w12, w13, FG0.Z
	/* z_3 = w13 <= if swap =? 0 then z_3 else z_2 */
	bn.sel w13, w13, w12, FG0.Z
	/* z_2 = w12 <= w6 = if swap =? 0 then z_2 else z_3 */
	bn.mov w12, w6

	/* w22 <= fe_inv(w12) = z_2^(p-2) */
	bn.mov w16, w12
	jal x1, fe_inv
	/* w22 <= w22 * w10 = (z_2^(p-2)) * x_2 */
	bn.mov w23, w10
	jal x1, fe_mul

	ret

	/**
	* Performs the core arithmetic step of the Montgomery ladder.
	*
	* This subroutine is intended to be used as part of a Montgomery ladder
	* implementation. It performs the core arithmetic in the main loop, i.e. the
	* following steps (from RFC 7748, section 5):
	* A = x_2 + z_2
	* AA = A^2
	* B = x_2 - z_2
	* BB = B^2
	* E = AA - BB
	* C = x_3 + z_3
	* D = x_3 - z_3
	* DA = D * A
	* CB = C * B
	* x_3 = (DA + CB)^2
	* z_3 = x_1 * (DA - CB)^2
	* x_2 = AA * BB
	* z_2 = E * (AA + a24 * E)
	*
	* All arithmetic (+, -, *, ^) above is in the finite field GF(2^255-19).
	*
	* TODO: investigate alternative laddersteps, such as the one in
	* curve25519-donna, which may be faster.
	*
	* This routine runs in constant time.
	*
	* Flags: Flags have no meaning beyond the scope of this subroutine.
	*
	* @param[in] w9: x_1, Montgomery u-coordinate of point A (k < p)
	* @param[in] w19: constant, w19 = 19
	* @param[in] w24: constant a24 = 121665
	* @param[in] w31: all-zero
	* @param[in] MOD: p, modulus = 2^255 - 19
	* @param[in,out] w10: x_2
	* @param[in,out] w11: x_3
	* @param[in,out] w12: z_2
	* @param[in,out] w13: z_3
	*
	* clobbered registers: w2 to w5, w10 to w13, w17, w18, w20 to w23
	* clobbered flag groups: FG0
	*/
	ladderstep:
	/* First, compute the new x_2 and z_2:
	A = x_2 + z_2
	AA = A^2
	B = x_2 - z_2
	BB = B^2
	E = AA - BB
	x_2 = AA * BB
	z_2 = E * (AA + a24 * E) */

	/* w2 <= w10 + w12 = x_2 + z_2 = A */
	bn.addm w2, w10, w12
	/* w3 <= w2^2 = A^2 = AA */
	bn.mov w22, w2
	jal x1, fe_square
	bn.mov w3, w22
	/* w4 <= w10 - w12 = x_2 - z_2 = B */
	bn.subm w4, w10, w12
	/* w22 <= w4^2 = B^2 = BB */
	bn.mov w22, w4
	jal x1, fe_square
	/* w5 <= w3 - w22 = AA - BB = E */
	bn.subm w5, w3, w22
	/* x_2 = w10 <= w22 * w3 = BB * AA */
	bn.mov w23, w3
	jal x1, fe_mul
	bn.mov w10, w22
	/* w23 <= w5 = E */
	bn.mov w23, w5
	/* TODO: create faster, specialized fe_mul for a24 */
	/* w22 <= w24 * w23 = a24 * E */
	bn.mov w22, w24
	jal x1, fe_mul
	/* w22 <= w3 + w22 = AA + (a24 * E) */
	bn.add w22, w3, w22
	/* z_2 = w12 <= w22 * w23 = (AA + a24 * E) * E */
	jal x1, fe_mul
	bn.mov w12, w22

	/* Now, compute the new x_3 and z_3, using A and B from the computation
	above:
	C = x_3 + z_3
	D = x_3 - z_3
	DA = D * A
	CB = C * B
	x_3 = (DA + CB)^2
	z_3 = x_1 * (DA - CB)^2 */

	/* w22 <= w11 + w13 = x_3 + z_3 = C */
	bn.addm w22, w11, w13
	/* w4 <= w22 * w4 = C * B = CB */
	bn.mov w23, w4
	jal x1, fe_mul
	bn.mov w4, w22
	/* w22 <= w11 - w13 = x_3 - z_3 = D */
	bn.subm w22, w11, w13
	/* w2 <= w22 * w2 = D * A = DA */
	bn.mov w23, w2
	jal x1, fe_mul
	bn.mov w2, w22
	/* w22 <= w2 + w4 <= DA + CB */
	bn.addm w22, w2, w4
	/* x_3 = w11 <= w22^2 = (DA + CB)^2 */
	jal x1, fe_square
	bn.mov w11, w22
	/* w22 <= w2 - w4 <= DA - CB */
	bn.subm w22, w2, w4
	/* w22 <= w22^2 = (DA - CB)^2 */
	jal x1, fe_square
	/* z_3 = w13 <= w22 * w9 = (DA - CB)^2 * x_1 */
	bn.mov w23, w9
	jal x1, fe_mul
	bn.mov w13, w22

	ret

	.data

	/* Modulus p = 2^255 - 19. */
	.balign 32
	modulus25519:
	.word 0xffffffed
	.word 0xffffffff
	.word 0xffffffff
	.word 0xffffffff
	.word 0xffffffff
	.word 0xffffffff
	.word 0xffffffff
	.word 0x7fffffff

	/* Constant (a - 2) / 4 for Curve25519 = 121665 (see RFC 7748, section 5). */
	.balign 32
	a24:
	.word 0x0001db41
	.zero 28