| /* Copyright lowRISC contributors. |
| * Copyright 2016 The Chromium OS Authors. All rights reserved. |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE.dcrypto file. |
| */ |
| |
| /** |
| * This library implements hash computation as specified in FIPS PUB 180-4 |
| * "Secure Hash Standard (SHS)". |
| * |
| * Terminology within the comments in this library is based (as much as |
| * possible) on the terminology of FIPS 180-4. |
| * |
| * The functions named with the greek sigma have been renamed: |
| * - sigma_lowercase(x) -> s(x) |
| * - sigma_uppercase(x) -> S(x) |
| * |
| * Upercase W_i denotes the i-th word from the message schedule. |
| * Uppercase M_i denotes the i_th message word of the current chunk. |
| * Uppercase K_i denotes the i-th round constant. |
| */ |
| |
| |
| .section .text |
| |
| |
| /** |
| * Compute SHA-512 hash |
| * |
| * Updates the SHA-512 state for n subsequent 1024-bit chunks of a |
| * pre-formatted message. |
| * |
| * The message is expected in dmem in a pre-processed format: |
| * - The message has been padded according to the SHA-512 standard. |
| * - The padded message has been broken down into 64-bit sized big-endian |
| * words. I.e. for a message stored at dmem address a, the expected |
| * formating for the first 16 message bytes is as follows |
| * (where mn denotes the n-th message byte): |
| * |a+15|a+14|a+13|a+12|a+11|a+10|a+9|a+8|a+7|a+6|a+5|a+4|a+3|a+2|a+1|a+0| |
| * | m8| m9| m10| m11| m12| m13|m14|m15| m0| m1| m2| m3| m4| m5| m6| m7| |
| * |
| * The state variables H[0] to H[7] are expected in dmem in 8 subsequent |
| * 256-bit memory cells, where each state variable occupies the lower 64 bit |
| * of such a cell. For the state stored at dmem address a the expected format |
| * is as follows: |
| * dmem[a + ][63:0]: H[0] |
| * dmem[a + 32][63:0]: H[1] |
| * dmem[a + 64][63:0]: H[2] |
| * dmem[a + 96][63:0]: H[3] |
| * dmem[a + 128][63:0]: H[4] |
| * dmem[a + 160][63:0]: H[5] |
| * dmem[a + 192][63:0]: H[6] |
| * dmem[a + 224][63:0]: H[7] |
| * The upper 192 bits of each cell are clobbered during the execution of the |
| * algorithm but their contents are irrelevant. |
| * |
| * The routine makes use of a 640 byte sized scratchpad in dmem for the message |
| * schedule. |
| * |
| * This routine runs in constant time. |
| * |
| * Flags: Flags have no meaning beyond the scope of this subroutine. |
| * |
| * @param[in] dmem[n_chunks]: number of chunks to process in a single go |
| * @param[in] dmem[dptr_state]: dmem location with state ][63:0]: H[0] |
| * @param[in] dmem[dptr_msg]: Pointer to memory location containing the pre- |
| * formatted message chunks. |
| * |
| * clobbered registers: w0 to w7, w10, w11, w15 to w26, w30, w31 |
| * x1, x2, x10, x11 to x17, x20 |
| * clobbered flag groups: FG0 |
| */ |
| .globl sha512 |
| sha512: |
| |
| /* w31 = 0; w30 = 1111...1111 */ |
| bn.xor w31, w31, w31 |
| bn.subi w30, w31, 1 |
| |
| /* read number of 1024-bit chunks from dmem */ |
| la x20, n_chunks |
| lw x20, 0(x20) |
| |
| /* read pointer to state variables from dmem */ |
| la x17, dptr_state |
| lw x17, 0(x17) |
| |
| /* read pointer to message buffer from dmem */ |
| la x14, dptr_msg |
| lw x14, 0(x14) |
| |
| /* init reg pointers */ |
| li x10, 10 |
| li x11, 11 |
| li x19, 22 |
| |
| /* init pointer to scratchpad for message schedule */ |
| la x13, W |
| |
| /* init pointer to round constants */ |
| la x16, K |
| |
| /* one iteration per chunk */ |
| loop x20, 369 |
| |
| /* reset pointer to message schedule */ |
| addi x12, x13, 0 |
| |
| /* Expand 1024 bit data chunk to full message schedule (W_0 ... W_79) |
| The 80 64-bit words of the message schedule are kept in dmem |
| scatchpad (20 256-bit cells). */ |
| |
| /* The message schedule's 16 lower words (W_0 to W_15) are set equal to the |
| 16 words of the message chunk (M_0 to M_15). |
| |
| The WDRs w19 to w22 are used as a sliding window over 16 words of the |
| message schedule and are initialized as follows: |
| w19 <= W_3 | W_2 | W_1 | W_0 |
| w20 <= W_7 | W_6 | W_5 | W_4 |
| w21 <= W_11 | W_10 | W_9 | W_8 |
| w22 <= W_15 | W_14 | W_13 | W_12 */ |
| addi x2, x0, 19 |
| loopi 4, 3 |
| bn.lid x2, 0(x14++) |
| bn.sid x2, 0(x12++) |
| addi x2, x2, 1 |
| |
| /* The remaining 74 words are constructed from the lower 16 ones: |
| W_t = s1(W_(t-2)) + W_(t-7) + s0(W_(t-15)) + W_(t-16) |
| with: |
| s0(x) = (x RROT 1) xor (x RROT 8) xor (x SHR 7) |
| s1(x) = (x RROT 19) xor (x RROT 61) xor (x SHR 6) */ |
| |
| /* In the loop body below, i denotes to the i-th cycle of this loop, |
| t refers to the index t as used in the FIPS document. Each loop |
| cycle computes 4 new words of the message schedule. Hence, i runs from |
| 0 to 15, and t runs from 16 to 79. |
| Note that the assignments in comments only show the relevant 64 bit for |
| each operation. The remaining bits are (usually) clobbered as well but |
| are irrelevant for further processing. */ |
| loopi 16, 74 |
| |
| /* t <= i*4 + 16 */ |
| |
| /* Window register contents (w19 to w22) at start of cycle: |
| w19 = W_(i*4+3) | W_(i*4+2) | W_(i*4+1) | W_(i*4) |
| = W_(t-13) | W_(t-14) | W_(t-15) | W_(t-16) |
| w20 = W_(i*4+7) | W_(i*4+6) | W_(i*4+5) | W_(i*4+4) |
| = W_(t-9) | W_(t-10) | W_(t-11) | W_(t-12) |
| w21 = W_(i*4+11) | W_(i*4+10) | W_(i*4+9) | W_(i*4+8) |
| = W_(t-5) | W_(t-6) | W_(t-7) | W_(t-8) |
| w22 = W_(i*4+15) | W_(i*4+14) | W_(i*4+13) | W_(i*4+12) |
| = W_(t-1) | W_(t-2) | W_(t-3) | W_(t-4) */ |
| |
| /* w15[255:192] <= s0( W_(t-15) ) |
| = (W_(t-15) ROTR 1) XOR (W_(t-15) ROTR 8) XOR (W_(t-15) SHR 8) */ |
| bn.rshi w18, w19, w30 >> 128 |
| bn.rshi w17, w30, w19 >> 64 |
| bn.rshi w15, w17, w18 >> 1 |
| bn.rshi w16, w17, w18 >> 8 |
| bn.xor w15, w15, w16 |
| bn.rshi w16, w31, w18 >> 7 |
| bn.xor w15, w15, w16 |
| |
| /* w23[63:0] <= W_(t-16) + W_(t-7) + s0( W_(t-15) ) */ |
| bn.add w23, w19, w15 >> 192 |
| bn.add w23, w23, w21 >> 64 |
| |
| /* w15[255:192] <= s1( W_(t-2) ) |
| = (W_(t-2) ROTR 19) XOR (W_(t-2) ROTR 61) XOR (W_(t-2) SHR 6) */ |
| bn.rshi w18, w22, w30 >> 192 |
| bn.rshi w17, w30, w22 >> 128 |
| bn.rshi w15, w17, w18 >> 19 |
| bn.rshi w16, w17, w18 >> 61 |
| bn.xor w15, w15, w16 |
| bn.rshi w16, w31, w18 >> 6 |
| bn.xor w15, w15, w16 |
| |
| /* w23[63:0] = w_t |
| <= W_(t-16) + W_(t-7) + s0( W_(t-15) ) + s1( W_(t-2) ) */ |
| bn.add w23, w23, w15 >> 192 |
| |
| |
| /* t <= i*4 + 17 |
| w19 = W_(t-14) | W_(t-15) | W_(t-16) | W_(t-17) |
| w20 = W_(t-10) | W_(t-11) | W_(t-12) | W_(t-13) |
| w21 = W_(t-6) | W_(t-7) | W_(t-8) | W_(t-9) |
| w22 = W_(t-2) | W_(t-3) | W_(t-4) | W_(t-5) |
| w23[63:0] = W_(t-1) */ |
| |
| /* w15[255:192] <= s0( W_(t-15) ) |
| = (W_(t-15) ROTR 1) XOR (W_(t-15) ROTR 8) XOR (W_(t-15) SHR 8) */ |
| bn.rshi w18, w19, w30 >> 192 |
| bn.rshi w17, w30, w19 >> 128 |
| bn.rshi w15, w17, w18 >> 1 |
| bn.rshi w16, w17, w18 >> 8 |
| bn.xor w15, w15, w16 |
| bn.rshi w16, w31, w18 >> 7 |
| bn.xor w15, w15, w16 |
| |
| /* w24[63:0] <= W_(t-16) + W_(t-7) + s0( W_(t-15) ) */ |
| bn.add w24, w31, w19 >> 64 |
| bn.add w24, w24, w15 >> 192 |
| bn.add w24, w24, w21 >> 128 |
| |
| /* w15[255:192] <= s1( W_(t-2) ) |
| = (W_(t-2) ROTR 19) XOR (W_(t-2) ROTR 61) XOR (W_(t-2) SHR 6) */ |
| bn.rshi w17, w30, w22 >> 192 |
| bn.rshi w15, w17, w22 >> 19 |
| bn.rshi w16, w17, w22 >> 61 |
| bn.xor w15, w15, w16 |
| bn.rshi w16, w31, w22 >> 6 |
| bn.xor w15, w15, w16 |
| |
| /* w24[63:0] = w_t |
| <= W_(t-16) + W_(t-7) + s0( W_(t-15) ) + s1( W_(t-2) ) */ |
| bn.add w24, w24, w15 >> 192 |
| |
| |
| /* t = i*4 + 18 |
| w19 = W_(t-15) | W_(t-16) | W_(t-17) | W_(t-18) |
| w20 = W_(t-11) | W_(t-12) | W_(t-13) | W_(t-14) |
| w21 = W_(t-7) | W_(t-8) | W_(t-9) | W_(t-10) |
| w22 = W_(t-3) | W_(t-4) | W_(t-5) | W_(t-6) |
| w23[63:0] = W_(t-2) |
| w24[63:0] = W_(t-1) */ |
| |
| /* w15[255:192] <= s0( W_(t-15) ) |
| = (W_(t-15) ROTR 1) XOR (W_(t-15) ROTR 8) XOR (W_(t-15) SHR 8) */ |
| bn.rshi w17, w30, w19 >> 192 |
| bn.rshi w15, w17, w19 >> 1 |
| bn.rshi w16, w17, w19 >> 8 |
| bn.xor w15, w15, w16 |
| bn.rshi w16, w31, w19 >> 7 |
| bn.xor w15, w15, w16 |
| |
| /* w25[63:0] <= W_(t-16) + W_(t-7) + s0( W_(t-15) ) */ |
| bn.add w25, w31, w19 >> 128 |
| bn.add w25, w25, w15 >> 192 |
| bn.add w25, w25, w21 >> 192 |
| |
| /* w15[255:192] <= s1( W_(t-2) ) |
| = (W_(t-2) ROTR 19) XOR (W_(t-2) ROTR 61) XOR (W_(t-2) SHR 6) */ |
| bn.rshi w18, w23, w30 >> 64 |
| bn.rshi w15, w23, w18 >> 19 |
| bn.rshi w16, w23, w18 >> 61 |
| bn.xor w15, w15, w16 |
| bn.rshi w16, w31, w18 >> 6 |
| bn.xor w15, w15, w16 |
| |
| /* w25[63:0] = w_t |
| <= W_(t-16) + W_(t-7) + s0( W_(t-15) ) + s1( W_(t-2) ) */ |
| bn.add w25, w25, w15 >> 192 |
| |
| |
| /* t = i*4 + 19 |
| w19 = W_(t-16) | W_(t-17) | W_(t-18) | W_(t-19) |
| w20 = W_(t-12) | W_(t-13) | W_(t-14) | W_(t-15) |
| w21 = W_(t-8) | W_(t-9) | W_(t-10) | W_(t-11) |
| w22 = W_(t-4) | W_(t-5) | W_(t-6) | W_(t-7) |
| w23[63:0] = W_(t-3) |
| w24[63:0] = W_(t-2) |
| w25[63:0] = W_(t-1) */ |
| |
| /* w15[255:192] <= s0( W_(t-15) ) |
| = (W_(t-15) ROTR 1) XOR (W_(t-15) ROTR 8) XOR (W_(t-15) SHR 8) */ |
| bn.rshi w18, w20, w30 >> 64 |
| bn.rshi w15, w20, w18 >> 1 |
| bn.rshi w16, w20, w18 >> 8 |
| bn.xor w15, w15, w16 |
| bn.rshi w16, w31, w18 >> 7 |
| bn.xor w15, w15, w16 |
| |
| /* w26[63:0] <= W_(t-16) + W_(t-7) + s0( W_(t-15) ) */ |
| bn.add w26, w31, w19 >> 192 |
| bn.add w26, w26, w15 >> 192 |
| bn.add w26, w26, w22 |
| |
| /* w15[255:192] <= s1( W_(t-2) ) |
| = (W_(t-2) ROTR 19) XOR (W_(t-2) ROTR 61) XOR (W_(t-2) SHR 6) */ |
| bn.rshi w18, w24, w30 >> 64 |
| bn.rshi w15, w24, w18 >> 19 |
| bn.rshi w16, w24, w18 >> 61 |
| bn.xor w15, w15, w16 |
| bn.rshi w16, w31, w18 >> 6 |
| bn.xor w15, w15, w16 |
| |
| /* w26[63:0] = w_t |
| <= W_(t-16) + W_(t-7) + s0( W_(t-15) ) + s1( W_(t-2) ) */ |
| bn.add w26, w26, w15 >> 192 |
| |
| /* Forward window */ |
| bn.mov w19, w20 |
| bn.mov w20, w21 |
| bn.mov w21, w22 |
| |
| /* Assemble 256-bit cell from the 4 words computed above */ |
| /* w22 = w26[63:0] | w25[63:0] | w24[63:0] | w23[64:0] |
| = W[i*4+19] | W[i*4+18] | W[i*4+17] | W[i*4+16] */ |
| bn.rshi w22, w23, w22 >> 64 |
| bn.rshi w22, w24, w22 >> 64 |
| bn.rshi w22, w25, w22 >> 64 |
| bn.rshi w22, w26, w22 >> 64 |
| |
| |
| /* Store the 4 words in dmem scratchpad */ |
| bn.sid x19, 0(x12++) |
| |
| |
| /* load state variables from dmem */ |
| addi x2, x0, 0 |
| /* w0[63:0] = a <= H_0 */ |
| bn.lid x2++, 0(x17) |
| /* w1[63:0] = b <= H_1 */ |
| bn.lid x2++, 32(x17) |
| /* w2[63:0] = c <= H_2 */ |
| bn.lid x2++, 64(x17) |
| /* w3[63:0] = d <= H_3 */ |
| bn.lid x2++, 96(x17) |
| /* w4[63:0] = e <= H_4 */ |
| bn.lid x2++, 128(x17) |
| /* w5[63:0] = f <= H_5 */ |
| bn.lid x2++, 160(x17) |
| /* w6[63:0] = g <= H_6 */ |
| bn.lid x2++, 192(x17) |
| /* w7[63:0] = h <= H_7 */ |
| bn.lid x2++, 224(x17) |
| |
| /* reset pointer to start of message schedule scratchpad in dmem */ |
| addi x12, x13, 0 |
| |
| /* reset pointer to beginning of dmem section containing round constants */ |
| addi x15, x16, 0 |
| |
| /* Main loop for SHA compression function. Processes 8 words of message |
| schedule in one cycle. |
| This saves copying the SHA working variables (a,b,...,h) after each |
| word. If code size becomes an issue, the size of the loop body can |
| be significantly reduced for the penalty 6 additional instructions after |
| each word. |
| Below, |
| i denotes the current loop cycle, and |
| t denotes the current word (hence t=i*8+(0,1,...,7) ). */ |
| loopi 10, 253 |
| |
| /* Load four round constants from dmem */ |
| /* w10 <= [K_(i*8+3),K_(i*8+2),K_(i*8+1),K_(i*8)] = dmem[K + 2*i] */ |
| bn.lid x10, 0(x15++) |
| |
| /* Load four message schedule words from dmem scratchpad */ |
| /* w11 <= [W_(i*8+3),W_(i*8+2),W_(i*8+1),W_(i*8)] = dmem[W + 2*i] */ |
| bn.lid x11, 0(x12++) |
| |
| /* w6[63:0] = g */ |
| /* w5[63:0] = f */ |
| /* w4[63:0] = e */ |
| /* w3[63:0] = d */ |
| /* w2[63:0] = c */ |
| /* w1[63:0] = b */ |
| /* w0[63:0] = a */ |
| |
| |
| /* Process word 0 of loop cycle: t <= i*8. */ |
| |
| /* w15[255:192] = S0(a) = (a ROTR 28) XOR (a ROTR 34) XOR (a ROTR 39) */ |
| bn.rshi w22, w0, w30 >> 64 |
| bn.rshi w15, w0, w22 >> 28 |
| bn.rshi w21, w0, w22 >> 34 |
| bn.xor w15, w15, w21 |
| bn.rshi w21, w0, w22 >> 39 |
| bn.xor w15, w15, w21 |
| |
| /* w16[63:0] = Maj(a,b,c) = (a AND b) XOR (a AND c) XOR (b AND c) */ |
| bn.and w16, w0, w1 |
| bn.and w21, w0, w2 |
| bn.xor w16, w16, w21 |
| bn.and w21, w1, w2 |
| bn.xor w16, w16, w21 |
| |
| /* w17[63:0] <= T2 = S0(a) + Maj(a,b,c) = w15[255:192] + w16[63:0] */ |
| bn.rshi w17, w30, w15 >> 192 |
| bn.add w17, w17, w16 |
| |
| /* w18[255:192] <= S1(e) = (e ROTR 14) XOR (e ROTR 18) XOR (e ROTR 41) */ |
| bn.rshi w22, w4, w30 >> 64 |
| bn.rshi w18, w4, w22 >> 14 |
| bn.rshi w21, w4, w22 >> 18 |
| bn.xor w18, w18, w21 |
| bn.rshi w19, w4, w22 >> 41 |
| bn.xor w18, w18, w19 |
| |
| /* w19[63:0] <= Ch(e,f,g) = (e AND f) XOR ((NOT e) AND g) */ |
| bn.and w19, w4, w5 |
| bn.not w21, w4 |
| bn.and w21, w21, w6 |
| bn.xor w19, w19, w21 |
| |
| /* w20[63:0] <= T1 = h + S1(e) + Ch(e,f,g) + K_t + W_t */ |
| bn.rshi w20, w30, w18 >> 192 |
| bn.add w20, w20, w7 |
| bn.add w20, w20, w10 |
| bn.add w21, w11, w19 |
| bn.add w20, w20, w21 |
| |
| /* w6[63:0] = h <= g */ |
| /* w5[63:0] = g <= f */ |
| /* w4[63:0] = f <= e */ |
| /* w3[63:0] = e <= d + T1 = w3[63:0] + w20[63:0] */ |
| bn.add w3, w3, w20 |
| /* w2[63:0] = d <= c */ |
| /* w1[63:0] = c <= b */ |
| /* w0[63:0] = b <= a */ |
| /* w7[63:0] = a = T_1 + T_2 = w20[63:0] + w17[63:0] */ |
| bn.add w7, w20, w17 |
| |
| |
| /* Process word 1 of loop cycle: t <= i*8 + 1. */ |
| |
| /* w15[255:192] = S0(a) = (a ROTR 28) XOR (a ROTR 34) XOR (a ROTR 39) */ |
| bn.rshi w22, w7, w30 >> 64 |
| bn.rshi w15, w7, w22 >> 28 |
| bn.rshi w21, w7, w22 >> 34 |
| bn.xor w15, w15, w21 |
| bn.rshi w21, w7, w22 >> 39 |
| bn.xor w15, w15, w21 |
| |
| /* w16[63:0] = Maj(a,b,c) = (a AND b) XOR (a AND c) XOR (b AND c) */ |
| bn.and w16, w7, w0 |
| bn.and w21, w7, w1 |
| bn.xor w16, w16, w21 |
| bn.and w21, w0, w1 |
| bn.xor w16, w16, w21 |
| |
| /* w17[63:0] <= T2 = S0(a) + Maj(a,b,c) = w15[255:192] + w16[63:0] */ |
| bn.rshi w17, w30, w15 >> 192 |
| bn.add w17, w17, w16 |
| |
| /* w18[255:192] <= S1(e) = (e ROTR 14) XOR (e ROTR 18) XOR (e ROTR 41) */ |
| bn.rshi w22, w3, w30 >> 64 |
| bn.rshi w18, w3, w22 >> 14 |
| bn.rshi w21, w3, w22 >> 18 |
| bn.xor w18, w18, w21 |
| bn.rshi w19, w3, w22 >> 41 |
| bn.xor w18, w18, w19 |
| |
| /* w19[63:0] <= Ch(e,f,g) = (e AND f) XOR ((NOT e) AND g) */ |
| bn.and w19, w3, w4 |
| bn.not w21, w3 |
| bn.and w21, w21, w5 |
| bn.xor w19, w19, w21 |
| |
| /* w20[63:0] <= T1 = h + S1(e) + Ch(e,f,g) + K_t + W_t */ |
| bn.rshi w20, w30, w18 >> 192 |
| bn.add w20, w20, w6 |
| bn.add w20, w20, w10 >> 64 |
| bn.rshi w21, w30, w11 >> 64 |
| bn.add w21, w21, w19 |
| bn.add w20, w20, w21 |
| |
| /* w5[63:0] = h <= g */ |
| /* w4[63:0] = g <= f */ |
| /* w3[63:0] = f <= e */ |
| /* w2[63:0] = e <= d + T1 = w2[63:0] + w20[63:0] */ |
| bn.add w2, w2, w20 |
| /* w1[63:0] = d <= c */ |
| /* w0[63:0] = c <= b */ |
| /* w7[63:0] = b <= a */ |
| /* w6[63:0] = a = T_1 + T_2 = w20[63:0] + w17[63:0] */ |
| bn.add w6, w20, w17 |
| |
| |
| /* Process word 2 of loop cycle: t <= i*8 + 2. */ |
| |
| /* w15[255:192] = S0(a) = (a ROTR 28) XOR (a ROTR 34) XOR (a ROTR 39) */ |
| bn.rshi w22, w6, w30 >> 64 |
| bn.rshi w15, w6, w22 >> 28 |
| bn.rshi w21, w6, w22 >> 34 |
| bn.xor w15, w15, w21 |
| bn.rshi w21, w6, w22 >> 39 |
| bn.xor w15, w15, w21 |
| |
| /* w16[63:0] = Maj(a,b,c) = (a AND b) XOR (a AND c) XOR (b AND c) */ |
| bn.and w16, w6, w7 |
| bn.and w21, w6, w0 |
| bn.xor w16, w16, w21 |
| bn.and w21, w7, w0 |
| bn.xor w16, w16, w21 |
| |
| /* w17[63:0] <= T2 = S0(a) + Maj(a,b,c) = w15[255:192] + w16[63:0] */ |
| bn.rshi w17, w30, w15 >> 192 |
| bn.add w17, w17, w16 |
| |
| /* w18[255:192] <= S1(e) = (e ROTR 14) XOR (e ROTR 18) XOR (e ROTR 41) */ |
| bn.rshi w22, w2, w30 >> 64 |
| bn.rshi w18, w2, w22 >> 14 |
| bn.rshi w21, w2, w22 >> 18 |
| bn.xor w18, w18, w21 |
| bn.rshi w19, w2, w22 >> 41 |
| bn.xor w18, w18, w19 |
| |
| /* w19[63:0] <= Ch(e,f,g) = (e AND f) XOR ((NOT e) AND g) */ |
| bn.and w19, w2, w3 |
| bn.not w21, w2 |
| bn.and w21, w21, w4 |
| bn.xor w19, w19, w21 |
| |
| /* w20[63:0] <= T1 = h + S1(e) + Ch(e,f,g) + K_t + W_t */ |
| bn.rshi w20, w30, w18 >> 192 |
| bn.add w20, w20, w5 |
| bn.add w20, w20, w10 >> 128 |
| bn.rshi w21, w30, w11 >> 128 |
| bn.add w21, w21, w19 |
| bn.add w20, w20, w21 |
| |
| /* w4[63:0] = h <= g */ |
| /* w3[63:0] = g <= f */ |
| /* w2[63:0] = f <= e */ |
| /* w1[63:0] = e <= d + T1 = w2[63:0] + w20[63:0] */ |
| bn.add w1, w1, w20 |
| /* w0[63:0] = d <= c */ |
| /* w7[63:0] = c <= b */ |
| /* w6[63:0] = b <= a */ |
| /* w5[63:0] = a = T_1 + T_2 = w20[63:0] + w17[63:0] */ |
| bn.add w5, w20, w17 |
| |
| |
| /* Process word 3 of loop cycle: t <= i*8 + 3. */ |
| |
| /* w15[255:192] = S0(a) = (a ROTR 28) XOR (a ROTR 34) XOR (a ROTR 39) */ |
| bn.rshi w22, w5, w30 >> 64 |
| bn.rshi w15, w5, w22 >> 28 |
| bn.rshi w21, w5, w22 >> 34 |
| bn.xor w15, w15, w21 |
| bn.rshi w21, w5, w22 >> 39 |
| bn.xor w15, w15, w21 |
| |
| /* w16[63:0] = Maj(a,b,c) = (a AND b) XOR (a AND c) XOR (b AND c) */ |
| bn.and w16, w5, w6 |
| bn.and w21, w5, w7 |
| bn.xor w16, w16, w21 |
| bn.and w21, w6, w7 |
| bn.xor w16, w16, w21 |
| |
| /* w17[63:0] <= T2 = S0(a) + Maj(a,b,c) = w15[255:192] + w16[63:0] */ |
| bn.rshi w17, w30, w15 >> 192 |
| bn.add w17, w17, w16 |
| |
| /* w18[255:192] <= S1(e) = (e ROTR 14) XOR (e ROTR 18) XOR (e ROTR 41) */ |
| bn.rshi w22, w1, w30 >> 64 |
| bn.rshi w18, w1, w22 >> 14 |
| bn.rshi w21, w1, w22 >> 18 |
| bn.xor w18, w18, w21 |
| bn.rshi w19, w1, w22 >> 41 |
| bn.xor w18, w18, w19 |
| |
| /* w19[63:0] <= Ch(e,f,g) = (e AND f) XOR ((NOT e) AND g) */ |
| bn.and w19, w1, w2 |
| bn.not w21, w1 |
| bn.and w21, w21, w3 |
| bn.xor w19, w19, w21 |
| |
| /* w20[63:0] <= T1 = h + S1(e) + Ch(e,f,g) + K_t + W_t */ |
| bn.rshi w20, w30, w18 >> 192 |
| bn.add w20, w20, w4 |
| bn.add w20, w20, w10 >> 192 |
| bn.rshi w21, w30, w11 >> 192 |
| bn.add w21, w21, w19 |
| bn.add w20, w20, w21 |
| |
| /* w3[63:0] = h <= g */ |
| /* w2[63:0] = g <= f */ |
| /* w1[63:0] = f <= e */ |
| /* w0[63:0] = e <= d + T1 = w2[63:0] + w20[63:0] */ |
| bn.add w0, w0, w20 |
| /* w7[63:0] = d <= c */ |
| /* w6[63:0] = c <= b */ |
| /* w5[63:0] = b <= a */ |
| /* w4[63:0] = a = T_1 + T_2 = w20[63:0] + w17[63:0] */ |
| bn.add w4, w20, w17 |
| |
| |
| /* Load another four round constants from dmem */ |
| /* w10 <= [K_(i*8+7),K_(i*8+6),K_(i*8+5),K_(i*8+4)] = dmem[K + 2*i+1] */ |
| bn.lid x10, 0(x15++) |
| |
| /* Load another four message schedule words from dmem scratchpad */ |
| /* w11 <= [W_(i*8+7),W_(i*8+6),W_(i*8+5),W_(i*8+4)] = dmem[W + 2*i+1] */ |
| bn.lid x11, 0(x12++) |
| |
| |
| /* Process word 4 of loop cycle: t <= i*8 + 3. */ |
| |
| /* w15[255:192] = S0(a) = (a ROTR 28) XOR (a ROTR 34) XOR (a ROTR 39) */ |
| bn.rshi w22, w4, w30 >> 64 |
| bn.rshi w15, w4, w22 >> 28 |
| bn.rshi w21, w4, w22 >> 34 |
| bn.xor w15, w15, w21 |
| bn.rshi w21, w4, w22 >> 39 |
| bn.xor w15, w15, w21 |
| |
| /* w16[63:0] = Maj(a,b,c) = (a AND b) XOR (a AND c) XOR (b AND c) */ |
| bn.and w16, w4, w5 |
| bn.and w21, w4, w6 |
| bn.xor w16, w16, w21 |
| bn.and w21, w5, w6 |
| bn.xor w16, w16, w21 |
| |
| /* w17[63:0] <= T2 = S0(a) + Maj(a,b,c) = w15[255:192] + w16[63:0] */ |
| bn.rshi w17, w30, w15 >> 192 |
| bn.add w17, w17, w16 |
| |
| /* w18[255:192] <= S1(e) = (e ROTR 14) XOR (e ROTR 18) XOR (e ROTR 41) */ |
| bn.rshi w22, w0, w30 >> 64 |
| bn.rshi w18, w0, w22 >> 14 |
| bn.rshi w21, w0, w22 >> 18 |
| bn.xor w18, w18, w21 |
| bn.rshi w19, w0, w22 >> 41 |
| bn.xor w18, w18, w19 |
| |
| /* w19[63:0] <= Ch(e,f,g) = (e AND f) XOR ((NOT e) AND g) */ |
| bn.and w19, w0, w1 |
| bn.not w21, w0 |
| bn.and w21, w21, w2 |
| bn.xor w19, w19, w21 |
| |
| /* w20[63:0] <= T1 = h + S1(e) + Ch(e,f,g) + K_t + W_t */ |
| bn.rshi w20, w30, w18 >> 192 |
| bn.add w20, w20, w3 |
| bn.add w20, w20, w10 >> 0 |
| bn.rshi w21, w30, w11 >> 0 |
| bn.add w21, w21, w19 |
| bn.add w20, w20, w21 |
| |
| /* w2[63:0] = h <= g */ |
| /* w1[63:0] = g <= f */ |
| /* w0[63:0] = f <= e */ |
| /* w7[63:0] = e <= d + T1 = w2[63:0] + w20[63:0] */ |
| bn.add w7, w7, w20 |
| /* w6[63:0] = d <= c */ |
| /* w5[63:0] = c <= b */ |
| /* w4[63:0] = b <= a */ |
| /* w3[63:0] = a = T_1 + T_2 = w20[63:0] + w17[63:0] */ |
| bn.add w3, w20, w17 |
| |
| |
| /* Process word 5 of loop cycle: t <= i*8 + 3. */ |
| |
| /* w15[255:192] = S0(a) = (a ROTR 28) XOR (a ROTR 34) XOR (a ROTR 39) */ |
| bn.rshi w22, w3, w30 >> 64 |
| bn.rshi w15, w3, w22 >> 28 |
| bn.rshi w21, w3, w22 >> 34 |
| bn.xor w15, w15, w21 |
| bn.rshi w21, w3, w22 >> 39 |
| bn.xor w15, w15, w21 |
| |
| /* w16[63:0] = Maj(a,b,c) = (a AND b) XOR (a AND c) XOR (b AND c) */ |
| bn.and w16, w3, w4 |
| bn.and w21, w3, w5 |
| bn.xor w16, w16, w21 |
| bn.and w21, w4, w5 |
| bn.xor w16, w16, w21 |
| |
| /* w17[63:0] <= T2 = S0(a) + Maj(a,b,c) = w15[255:192] + w16[63:0] */ |
| bn.rshi w17, w30, w15 >> 192 |
| bn.add w17, w17, w16 |
| |
| |
| /* w18[255:192] <= S1(e) = (e ROTR 14) XOR (e ROTR 18) XOR (e ROTR 41) */ |
| bn.rshi w22, w7, w30 >> 64 |
| bn.rshi w18, w7, w22 >> 14 |
| bn.rshi w21, w7, w22 >> 18 |
| bn.xor w18, w18, w21 |
| bn.rshi w19, w7, w22 >> 41 |
| bn.xor w18, w18, w19 |
| |
| |
| /* w19[63:0] <= Ch(e,f,g) = (e AND f) XOR ((NOT e) AND g) */ |
| bn.and w19, w7, w0 |
| bn.not w21, w7 |
| bn.and w21, w21, w1 |
| bn.xor w19, w19, w21 |
| |
| /* w20[63:0] <= T1 = h + S1(e) + Ch(e,f,g) + K_t + W_t */ |
| bn.rshi w20, w30, w18 >> 192 |
| bn.add w20, w20, w2 |
| bn.add w20, w20, w10 >> 64 |
| bn.rshi w21, w30, w11 >> 64 |
| bn.add w21, w21, w19 |
| bn.add w20, w20, w21 |
| |
| /* w1[63:0] = h <= g */ |
| /* w0[63:0] = g <= f */ |
| /* w7[63:0] = f <= e */ |
| /* w6[63:0] = e <= d + T1 = w2[63:0] + w20[63:0] */ |
| bn.add w6, w6, w20 |
| /* w5[63:0] = d <= c */ |
| /* w4[63:0] = c <= b */ |
| /* w3[63:0] = b <= a */ |
| /* w2[63:0] = a = T_1 + T_2 = w20[63:0] + w17[63:0] */ |
| bn.add w2, w20, w17 |
| |
| /* Process word 6 of loop cycle: t <= i*8 + 3. */ |
| |
| /* w15[255:192] = S0(a) = (a ROTR 28) XOR (a ROTR 34) XOR (a ROTR 39) */ |
| bn.rshi w22, w2, w30 >> 64 |
| bn.rshi w15, w2, w22 >> 28 |
| bn.rshi w21, w2, w22 >> 34 |
| bn.xor w15, w15, w21 |
| bn.rshi w21, w2, w22 >> 39 |
| bn.xor w15, w15, w21 |
| |
| /* w16[63:0] = Maj(a,b,c) = (a AND b) XOR (a AND c) XOR (b AND c) */ |
| bn.and w16, w2, w3 |
| bn.and w21, w2, w4 |
| bn.xor w16, w16, w21 |
| bn.and w21, w3, w4 |
| bn.xor w16, w16, w21 |
| |
| /* w17[63:0] <= T2 = S0(a) + Maj(a,b,c) = w15[255:192] + w16[63:0] */ |
| bn.rshi w17, w30, w15 >> 192 |
| bn.add w17, w17, w16 |
| |
| /* w18[255:192] <= S1(e) = (e ROTR 14) XOR (e ROTR 18) XOR (e ROTR 41) */ |
| bn.rshi w22, w6, w30 >> 64 |
| bn.rshi w18, w6, w22 >> 14 |
| bn.rshi w21, w6, w22 >> 18 |
| bn.xor w18, w18, w21 |
| bn.rshi w19, w6, w22 >> 41 |
| bn.xor w18, w18, w19 |
| |
| /* w19[63:0] <= Ch(e,f,g) = (e AND f) XOR ((NOT e) AND g) */ |
| bn.and w19, w6, w7 |
| bn.not w21, w6 |
| bn.and w21, w21, w0 |
| bn.xor w19, w19, w21 |
| |
| /* w20[63:0] <= T1 = h + S1(e) + Ch(e,f,g) + K_t + W_t */ |
| bn.rshi w20, w30, w18 >> 192 |
| bn.add w20, w20, w1 |
| bn.add w20, w20, w10 >> 128 |
| bn.rshi w21, w30, w11 >> 128 |
| bn.add w21, w21, w19 |
| bn.add w20, w20, w21 |
| |
| /* w0[63:0] = h <= g */ |
| /* w7[63:0] = g <= f */ |
| /* w6[63:0] = f <= e */ |
| /* w5[63:0] = e <= d + T1 = w2[63:0] + w20[63:0] */ |
| bn.add w5, w5, w20 |
| /* w4[63:0] = d <= c */ |
| /* w3[63:0] = c <= b */ |
| /* w2[63:0] = b <= a */ |
| /* w1[63:0] = a = T_1 + T_2 = w20[63:0] + w17[63:0] */ |
| bn.add w1, w20, w17 |
| |
| |
| /* Process word 7 of loop cycle: t <= i*8 + 3. */ |
| |
| /* w15[255:192] = S0(a) = (a ROTR 28) XOR (a ROTR 34) XOR (a ROTR 39) */ |
| bn.rshi w22, w1, w30 >> 64 |
| bn.rshi w15, w1, w22 >> 28 |
| bn.rshi w21, w1, w22 >> 34 |
| bn.xor w15, w15, w21 |
| bn.rshi w21, w1, w22 >> 39 |
| bn.xor w15, w15, w21 |
| |
| /* w16[63:0] = Maj(a,b,c) = (a AND b) XOR (a AND c) XOR (b AND c) */ |
| bn.and w16, w1, w2 |
| bn.and w21, w1, w3 |
| bn.xor w16, w16, w21 |
| bn.and w21, w2, w3 |
| bn.xor w16, w16, w21 |
| |
| /* w17[63:0] <= T2 = S0(a) + Maj(a,b,c) = w15[255:192] + w16[63:0] */ |
| bn.rshi w17, w30, w15 >> 192 |
| bn.add w17, w17, w16 |
| |
| /* w18[255:192] <= S1(e) = (e ROTR 14) XOR (e ROTR 18) XOR (e ROTR 41) */ |
| bn.rshi w22, w5, w30 >> 64 |
| bn.rshi w18, w5, w22 >> 14 |
| bn.rshi w21, w5, w22 >> 18 |
| bn.xor w18, w18, w21 |
| bn.rshi w19, w5, w22 >> 41 |
| bn.xor w18, w18, w19 |
| |
| /* w19[63:0] <= Ch(e,f,g) = (e AND f) XOR ((NOT e) AND g) */ |
| bn.and w19, w5, w6 |
| bn.not w21, w5 |
| bn.and w21, w21, w7 |
| bn.xor w19, w19, w21 |
| |
| /* w20[63:0] <= T1 = h + S1(e) + Ch(e,f,g) + K_t + W_t */ |
| bn.rshi w20, w30, w18 >> 192 |
| bn.add w20, w20, w0 |
| bn.add w20, w20, w10 >> 192 |
| bn.rshi w21, w30, w11 >> 192 |
| bn.add w21, w21, w19 |
| bn.add w20, w20, w21 |
| |
| /* w7[63:0] = h <= g */ |
| /* w6[63:0] = g <= f */ |
| /* w5[63:0] = f <= e */ |
| /* w4[63:0] = e <= d + T1 = w2[63:0] + w20[63:0] */ |
| bn.add w4, w4, w20 |
| /* w3[63:0] = d <= c */ |
| /* w2[63:0] = c <= b */ |
| /* w1[63:0] = b <= a */ |
| /* w0[63:0] = a = T_1 + T_2 = w20[63:0] + w17[63:0] */ |
| bn.add w0, w20, w17 |
| |
| |
| |
| /* Add compressed chunk to current hash value */ |
| addi x2, x0, 15 |
| |
| /* H_0 <= H_0 + a */ |
| bn.lid x2, 0(x17) |
| bn.add w15, w0, w15 |
| bn.sid x2, 0(x17) |
| |
| /* H_1 <= H_1 + b */ |
| bn.lid x2, 32(x17) |
| bn.add w15, w1, w15 |
| bn.sid x2, 32(x17) |
| |
| /* H_2 <= H_2 + c */ |
| bn.lid x2, 64(x17) |
| bn.add w15, w2, w15 |
| bn.sid x2, 64(x17) |
| |
| /* H_3 <= H_3 + d */ |
| bn.lid x2, 96(x17) |
| bn.add w15, w3, w15 |
| bn.sid x2, 96(x17) |
| |
| /* H_4 <= H_4 + e */ |
| bn.lid x2, 128(x17) |
| bn.add w15, w4, w15 |
| bn.sid x2, 128(x17) |
| |
| /* H_5 <= H_5 + f */ |
| bn.lid x2, 160(x17) |
| bn.add w15, w5, w15 |
| bn.sid x2, 160(x17) |
| |
| /* H_6 <= H_6 + g */ |
| bn.lid x2, 192(x17) |
| bn.add w15, w6, w15 |
| bn.sid x2, 192(x17) |
| |
| /* H_7 <= H_7 + h */ |
| bn.lid x2, 224(x17) |
| bn.add w15, w7, w15 |
| bn.sid x2, 224(x17) |
| |
| ret |
| |
| .bss |
| |
| /* number of chunks to process */ |
| .globl n_chunks |
| .balign 4 |
| n_chunks: |
| .zero 4 |
| |
| /* pointer to state (dptr_state) */ |
| .globl dptr_state |
| .balign 4 |
| dptr_state: |
| .zero 4 |
| |
| /* pointer to msg (dptr_msg) */ |
| .globl dptr_msg |
| .balign 4 |
| dptr_msg: |
| .zero 4 |
| |
| |
| /* 80*8=640 bytes scratchpad for message schedule */ |
| .section .scratchpad |
| .balign 32 |
| W: |
| .zero 640 |
| |
| .data |
| |
| /* SHA-512 round constants */ |
| .balign 32 |
| K: |
| .dword 0x428a2f98d728ae22 |
| .dword 0x7137449123ef65cd |
| .dword 0xb5c0fbcfec4d3b2f |
| .dword 0xe9b5dba58189dbbc |
| .dword 0x3956c25bf348b538 |
| .dword 0x59f111f1b605d019 |
| .dword 0x923f82a4af194f9b |
| .dword 0xab1c5ed5da6d8118 |
| .dword 0xd807aa98a3030242 |
| .dword 0x12835b0145706fbe |
| .dword 0x243185be4ee4b28c |
| .dword 0x550c7dc3d5ffb4e2 |
| .dword 0x72be5d74f27b896f |
| .dword 0x80deb1fe3b1696b1 |
| .dword 0x9bdc06a725c71235 |
| .dword 0xc19bf174cf692694 |
| .dword 0xe49b69c19ef14ad2 |
| .dword 0xefbe4786384f25e3 |
| .dword 0x0fc19dc68b8cd5b5 |
| .dword 0x240ca1cc77ac9c65 |
| .dword 0x2de92c6f592b0275 |
| .dword 0x4a7484aa6ea6e483 |
| .dword 0x5cb0a9dcbd41fbd4 |
| .dword 0x76f988da831153b5 |
| .dword 0x983e5152ee66dfab |
| .dword 0xa831c66d2db43210 |
| .dword 0xb00327c898fb213f |
| .dword 0xbf597fc7beef0ee4 |
| .dword 0xc6e00bf33da88fc2 |
| .dword 0xd5a79147930aa725 |
| .dword 0x06ca6351e003826f |
| .dword 0x142929670a0e6e70 |
| .dword 0x27b70a8546d22ffc |
| .dword 0x2e1b21385c26c926 |
| .dword 0x4d2c6dfc5ac42aed |
| .dword 0x53380d139d95b3df |
| .dword 0x650a73548baf63de |
| .dword 0x766a0abb3c77b2a8 |
| .dword 0x81c2c92e47edaee6 |
| .dword 0x92722c851482353b |
| .dword 0xa2bfe8a14cf10364 |
| .dword 0xa81a664bbc423001 |
| .dword 0xc24b8b70d0f89791 |
| .dword 0xc76c51a30654be30 |
| .dword 0xd192e819d6ef5218 |
| .dword 0xd69906245565a910 |
| .dword 0xf40e35855771202a |
| .dword 0x106aa07032bbd1b8 |
| .dword 0x19a4c116b8d2d0c8 |
| .dword 0x1e376c085141ab53 |
| .dword 0x2748774cdf8eeb99 |
| .dword 0x34b0bcb5e19b48a8 |
| .dword 0x391c0cb3c5c95a63 |
| .dword 0x4ed8aa4ae3418acb |
| .dword 0x5b9cca4f7763e373 |
| .dword 0x682e6ff3d6b2b8a3 |
| .dword 0x748f82ee5defb2fc |
| .dword 0x78a5636f43172f60 |
| .dword 0x84c87814a1f0ab72 |
| .dword 0x8cc702081a6439ec |
| .dword 0x90befffa23631e28 |
| .dword 0xa4506cebde82bde9 |
| .dword 0xbef9a3f7b2c67915 |
| .dword 0xc67178f2e372532b |
| .dword 0xca273eceea26619c |
| .dword 0xd186b8c721c0c207 |
| .dword 0xeada7dd6cde0eb1e |
| .dword 0xf57d4f7fee6ed178 |
| .dword 0x06f067aa72176fba |
| .dword 0x0a637dc5a2c898a6 |
| .dword 0x113f9804bef90dae |
| .dword 0x1b710b35131c471b |
| .dword 0x28db77f523047d84 |
| .dword 0x32caab7b40c72493 |
| .dword 0x3c9ebe0a15c9bebc |
| .dword 0x431d67c49c100d4c |
| .dword 0x4cc5d4becb3e42b6 |
| .dword 0x597f299cfc657e2a |
| .dword 0x5fcb6fab3ad6faec |
| .dword 0x6c44198c4a475817 |