|  | /* Copyright lowRISC contributors. | 
|  | * Copyright 2016 The Chromium OS Authors. All rights reserved. | 
|  | * Use of this source code is governed by a BSD-style license that can be | 
|  | * found in the LICENSE.dcrypto file. | 
|  | */ | 
|  |  | 
|  | /** | 
|  | * This library implements hash computation as specified in FIPS PUB 180-4 | 
|  | * "Secure Hash Standard (SHS)". | 
|  | * | 
|  | * Terminology within the comments in this library is based (as much as | 
|  | * possible) on the terminology of FIPS 180-4. | 
|  | * | 
|  | * The functions named with the greek sigma have been renamed: | 
|  | *   - sigma_lowercase(x) -> s(x) | 
|  | *   - sigma_uppercase(x) -> S(x) | 
|  | * | 
|  | * Uppercase W_i denotes the i-th word from the message schedule. | 
|  | * Uppercase M_i denotes the i-th message word of the current chunk. | 
|  | * Uppercase K_i denotes the i-th round constant. | 
|  | */ | 
|  |  | 
|  |  | 
|  | .section .text | 
|  |  | 
|  |  | 
|  | /** | 
|  | * Compute SHA-512 hash | 
|  | * | 
|  | * Updates the SHA-512 state for n subsequent 1024-bit chunks of a | 
|  | * pre-formatted message. | 
|  | * | 
|  | * The message is expected in dmem in a pre-processed format: | 
|  | *  - The message has been padded according to the SHA-512 standard. | 
|  | *  - The padded message has been broken down into 64-bit sized big-endian | 
|  | *    words. I.e. for a message stored at dmem address a, the expected | 
|  | *    formatting for the first 16 message bytes is as follows | 
|  | *    (where mn denotes the n-th message byte): | 
|  | *    |a+15|a+14|a+13|a+12|a+11|a+10|a+9|a+8|a+7|a+6|a+5|a+4|a+3|a+2|a+1|a+0| | 
|  | *    |  m8|  m9| m10| m11| m12| m13|m14|m15| m0| m1| m2| m3| m4| m5| m6| m7| | 
|  | * | 
|  | * The state variables H[0] to H[7] are expected in dmem in 8 subsequent | 
|  | * 256-bit memory cells, where each state variable occupies the lower 64 bit | 
|  | * of such a cell. For the state stored at dmem address a the expected format | 
|  | * is as follows: | 
|  | *  dmem[a +   0][63:0]: H[0] | 
|  | *  dmem[a +  32][63:0]: H[1] | 
|  | *  dmem[a +  64][63:0]: H[2] | 
|  | *  dmem[a +  96][63:0]: H[3] | 
|  | *  dmem[a + 128][63:0]: H[4] | 
|  | *  dmem[a + 160][63:0]: H[5] | 
|  | *  dmem[a + 192][63:0]: H[6] | 
|  | *  dmem[a + 224][63:0]: H[7] | 
|  | * The upper 192 bits of each cell are clobbered during the execution of the | 
|  | * algorithm but their contents are irrelevant. | 
|  | * | 
|  | * The routine makes use of a 640 byte sized scratchpad in dmem for the message | 
|  | * schedule. | 
|  | * | 
|  | * This routine runs in constant time. | 
|  | * | 
|  | * Flags: Flags have no meaning beyond the scope of this subroutine. | 
|  | * | 
|  | * Register roles inside the routine: | 
|  | *   x2        temporary WDR index for indirect loads/stores | 
|  | *   x10, x11  constant WDR indices 10 and 11 (round constants, schedule words) | 
|  | *   x19       constant WDR index 22 (assembled message schedule cell) | 
|  | *   x12, x13  running pointer into / base address of the scratchpad W | 
|  | *   x14       running pointer into the message buffer | 
|  | *   x15, x16  running pointer into / base address of the round constants K | 
|  | *   x17       base address of the state variables | 
|  | *   x20       number of chunks (outer loop count) | 
|  | *   w30, w31  all-ones and all-zero constants | 
|  | * | 
|  | * NOTE(review): n_chunks is used directly as a LOOP iteration count, so it is | 
|  | * presumably required to be at least 1 - confirm against the OTBN ISA. | 
|  | * | 
|  | * @param[in]     dmem[n_chunks]: number of chunks to process in a single go | 
|  | * @param[in]     dmem[dptr_state]: Pointer to the dmem location containing the | 
|  | *                                  state variables H[0] to H[7]. | 
|  | * @param[in]     dmem[dptr_msg]: Pointer to memory location containing the pre- | 
|  | *                                formatted message chunks. | 
|  | * @param[in,out] dmem[state]: the eight state cells are updated in place. | 
|  | * | 
|  | * clobbered registers: w0 to w7, w10, w11, w15 to w26, w30, w31 | 
|  | *                      x2, x10 to x17, x19, x20 | 
|  | * clobbered flag groups: FG0 | 
|  | */ | 
|  | .globl sha512 | 
|  | sha512: | 
|  |  | 
|  | /* w31 = 0; w30 = 1111...1111 */ | 
|  | bn.xor  w31, w31, w31 | 
|  | bn.subi w30, w31, 1 | 
|  |  | 
|  | /* read number of 1024-bit chunks from dmem */ | 
|  | la x20, n_chunks | 
|  | lw x20, 0(x20) | 
|  |  | 
|  | /* read pointer to state variables from dmem */ | 
|  | la x17, dptr_state | 
|  | lw x17, 0(x17) | 
|  |  | 
|  | /* read pointer to message buffer from dmem */ | 
|  | la x14, dptr_msg | 
|  | lw x14, 0(x14) | 
|  |  | 
|  | /* init reg pointers: x10 -> w10, x11 -> w11, x19 -> w22 */ | 
|  | li x10, 10 | 
|  | li x11, 11 | 
|  | li x19, 22 | 
|  |  | 
|  | /* init pointer to scratchpad for message schedule */ | 
|  | la x13, W | 
|  |  | 
|  | /* init pointer to round constants */ | 
|  | la x16, K | 
|  |  | 
|  | /* One iteration per chunk. | 
|  | The body size is an instruction count and has to be adjusted whenever | 
|  | instructions are added to or removed from the loop body: | 
|  | 369 = 118 instructions outside of the compression loop body | 
|  | + 251 instructions of the compression loop body. */ | 
|  | loop x20, 369 | 
|  |  | 
|  | /* reset pointer to message schedule */ | 
|  | addi x12, x13, 0 | 
|  |  | 
|  | /* Expand 1024 bit data chunk to full message schedule (W_0 ... W_79) | 
|  | The 80 64-bit words of the message schedule are kept in dmem | 
|  | scratchpad (20 256-bit cells). */ | 
|  |  | 
|  | /* The message schedule's 16 lower words (W_0 to W_15) are set equal to the | 
|  | 16 words of the message chunk (M_0 to M_15). | 
|  |  | 
|  | The WDRs w19 to w22 are used as a sliding window over 16 words of the | 
|  | message schedule and are initialized as follows: | 
|  | w19 <=  W_3  | W_2  | W_1  | W_0 | 
|  | w20 <=  W_7  | W_6  | W_5  | W_4 | 
|  | w21 <=  W_11 | W_10 | W_9  | W_8 | 
|  | w22 <=  W_15 | W_14 | W_13 | W_12 */ | 
|  | addi    x2, x0, 19 | 
|  | loopi   4, 3 | 
|  | bn.lid  x2, 0(x14++) | 
|  | bn.sid  x2, 0(x12++) | 
|  | addi    x2, x2, 1 | 
|  |  | 
|  | /* The remaining 64 words are constructed from the lower 16 ones: | 
|  | W_t = s1(W_(t-2)) + W_(t-7) + s0(W_(t-15)) + W_(t-16) | 
|  | with: | 
|  | s0(x) = (x ROTR 1) xor (x ROTR 8) xor (x SHR 7) | 
|  | s1(x) = (x ROTR 19) xor (x ROTR 61) xor (x SHR 6) */ | 
|  |  | 
|  | /* In the loop body below, i denotes the i-th cycle of this loop, | 
|  | t refers to the index t as used in the FIPS document. Each loop | 
|  | cycle computes 4 new words of the message schedule. Hence, i runs from | 
|  | 0 to 15, and t runs from 16 to 79. | 
|  | Note that the assignments in comments only show the relevant 64 bit for | 
|  | each operation. The remaining bits are (usually) clobbered as well but | 
|  | are irrelevant for further processing. | 
|  | Loop body size: 17 + 17 + 16 + 16 instructions for the four words plus | 
|  | 8 instructions for window update and store = 74. */ | 
|  | loopi   16, 74 | 
|  |  | 
|  | /* t <= i*4 + 16 */ | 
|  |  | 
|  | /* Window register contents (w19 to w22) at start of cycle: | 
|  | w19 = W_(i*4+3)  | W_(i*4+2)  | W_(i*4+1)  | W_(i*4) | 
|  | = W_(t-13)   | W_(t-14)   | W_(t-15)   | W_(t-16) | 
|  | w20 = W_(i*4+7)  | W_(i*4+6)  | W_(i*4+5)  | W_(i*4+4) | 
|  | = W_(t-9)    | W_(t-10)   | W_(t-11)   | W_(t-12) | 
|  | w21 = W_(i*4+11) | W_(i*4+10) | W_(i*4+9)  | W_(i*4+8) | 
|  | = W_(t-5)    | W_(t-6)    | W_(t-7)    | W_(t-8) | 
|  | w22 = W_(i*4+15) | W_(i*4+14) | W_(i*4+13) | W_(i*4+12) | 
|  | = W_(t-1)    | W_(t-2)    | W_(t-3)    | W_(t-4) */ | 
|  |  | 
|  | /* w15[255:192] <= s0( W_(t-15) ) | 
|  | = (W_(t-15) ROTR 1) XOR (W_(t-15) ROTR 8) XOR (W_(t-15) SHR 7) | 
|  | w18[255:192] and w17[63:0] both hold W_(t-15), so that a right shift of | 
|  | the concatenation w17||w18 yields a 64-bit rotation in w15[255:192]. */ | 
|  | bn.rshi  w18, w19, w30 >> 128 | 
|  | bn.rshi  w17, w30, w19 >> 64 | 
|  | bn.rshi  w15, w17, w18 >> 1 | 
|  | bn.rshi  w16, w17, w18 >> 8 | 
|  | bn.xor   w15, w15, w16 | 
|  | bn.rshi  w16, w31, w18 >> 7 | 
|  | bn.xor   w15, w15, w16 | 
|  |  | 
|  | /* w23[63:0] <= W_(t-16) + W_(t-7) + s0( W_(t-15) ) */ | 
|  | bn.add   w23, w19, w15 >> 192 | 
|  | bn.add   w23, w23, w21 >> 64 | 
|  |  | 
|  | /* w15[255:192] <= s1( W_(t-2) ) | 
|  | = (W_(t-2) ROTR 19) XOR (W_(t-2) ROTR 61) XOR (W_(t-2) SHR 6) */ | 
|  | bn.rshi  w18, w22, w30  >> 192 | 
|  | bn.rshi  w17, w30, w22  >> 128 | 
|  | bn.rshi  w15, w17, w18  >> 19 | 
|  | bn.rshi  w16, w17, w18  >> 61 | 
|  | bn.xor   w15, w15, w16 | 
|  | bn.rshi  w16, w31, w18  >> 6 | 
|  | bn.xor   w15, w15, w16 | 
|  |  | 
|  | /* w23[63:0] = W_t | 
|  | <= W_(t-16) + W_(t-7) + s0( W_(t-15) ) + s1( W_(t-2) ) */ | 
|  | bn.add   w23, w23, w15 >> 192 | 
|  |  | 
|  |  | 
|  | /* t <= i*4 + 17 | 
|  | w19       = W_(t-14) | W_(t-15) | W_(t-16) | W_(t-17) | 
|  | w20       = W_(t-10) | W_(t-11) | W_(t-12) | W_(t-13) | 
|  | w21       = W_(t-6)  | W_(t-7)  | W_(t-8)  | W_(t-9) | 
|  | w22       = W_(t-2)  | W_(t-3)  | W_(t-4)  | W_(t-5) | 
|  | w23[63:0] = W_(t-1) */ | 
|  |  | 
|  | /* w15[255:192] <= s0( W_(t-15) ) | 
|  | = (W_(t-15) ROTR 1) XOR (W_(t-15) ROTR 8) XOR (W_(t-15) SHR 7) */ | 
|  | bn.rshi  w18, w19, w30  >> 192 | 
|  | bn.rshi  w17, w30, w19  >> 128 | 
|  | bn.rshi  w15, w17, w18  >> 1 | 
|  | bn.rshi  w16, w17, w18  >> 8 | 
|  | bn.xor   w15, w15, w16 | 
|  | bn.rshi  w16, w31, w18  >> 7 | 
|  | bn.xor   w15, w15, w16 | 
|  |  | 
|  | /* w24[63:0] <= W_(t-16) + W_(t-7) + s0( W_(t-15) ) */ | 
|  | bn.add   w24, w31, w19 >> 64 | 
|  | bn.add   w24, w24, w15 >> 192 | 
|  | bn.add   w24, w24, w21 >> 128 | 
|  |  | 
|  | /* w15[255:192] <= s1( W_(t-2) ) | 
|  | = (W_(t-2) ROTR 19) XOR (W_(t-2) ROTR 61) XOR (W_(t-2) SHR 6) | 
|  | W_(t-2) already sits in w22[255:192], hence only w17 is needed. */ | 
|  | bn.rshi  w17, w30, w22  >> 192 | 
|  | bn.rshi  w15, w17, w22  >> 19 | 
|  | bn.rshi  w16, w17, w22  >> 61 | 
|  | bn.xor   w15, w15, w16 | 
|  | bn.rshi  w16, w31, w22  >> 6 | 
|  | bn.xor   w15, w15, w16 | 
|  |  | 
|  | /* w24[63:0] = W_t | 
|  | <= W_(t-16) + W_(t-7) + s0( W_(t-15) ) + s1( W_(t-2) ) */ | 
|  | bn.add   w24, w24, w15 >> 192 | 
|  |  | 
|  |  | 
|  | /* t <= i*4 + 18 | 
|  | w19       = W_(t-15) | W_(t-16) | W_(t-17) | W_(t-18) | 
|  | w20       = W_(t-11) | W_(t-12) | W_(t-13) | W_(t-14) | 
|  | w21       = W_(t-7)  | W_(t-8)  | W_(t-9)  | W_(t-10) | 
|  | w22       = W_(t-3)  | W_(t-4)  | W_(t-5)  | W_(t-6) | 
|  | w23[63:0] = W_(t-2) | 
|  | w24[63:0] = W_(t-1) */ | 
|  |  | 
|  | /* w15[255:192] <= s0( W_(t-15) ) | 
|  | = (W_(t-15) ROTR 1) XOR (W_(t-15) ROTR 8) XOR (W_(t-15) SHR 7) | 
|  | W_(t-15) already sits in w19[255:192], hence only w17 is needed. */ | 
|  | bn.rshi  w17, w30, w19  >> 192 | 
|  | bn.rshi  w15, w17, w19  >> 1 | 
|  | bn.rshi  w16, w17, w19  >> 8 | 
|  | bn.xor   w15, w15, w16 | 
|  | bn.rshi  w16, w31, w19  >> 7 | 
|  | bn.xor   w15, w15, w16 | 
|  |  | 
|  | /* w25[63:0] <= W_(t-16) + W_(t-7) + s0( W_(t-15) ) */ | 
|  | bn.add   w25, w31, w19 >> 128 | 
|  | bn.add   w25, w25, w15 >> 192 | 
|  | bn.add   w25, w25, w21 >> 192 | 
|  |  | 
|  | /* w15[255:192] <= s1( W_(t-2) ) | 
|  | = (W_(t-2) ROTR 19) XOR (W_(t-2) ROTR 61) XOR (W_(t-2) SHR 6) | 
|  | W_(t-2) already sits in w23[63:0], hence only w18 is needed. */ | 
|  | bn.rshi  w18, w23, w30  >> 64 | 
|  | bn.rshi  w15, w23, w18  >> 19 | 
|  | bn.rshi  w16, w23, w18  >> 61 | 
|  | bn.xor   w15, w15, w16 | 
|  | bn.rshi  w16, w31, w18  >> 6 | 
|  | bn.xor   w15, w15, w16 | 
|  |  | 
|  | /* w25[63:0] = W_t | 
|  | <= W_(t-16) + W_(t-7) + s0( W_(t-15) ) + s1( W_(t-2) ) */ | 
|  | bn.add   w25, w25, w15 >> 192 | 
|  |  | 
|  |  | 
|  | /* t <= i*4 + 19 | 
|  | w19       = W_(t-16) | W_(t-17) | W_(t-18) | W_(t-19) | 
|  | w20       = W_(t-12) | W_(t-13) | W_(t-14) | W_(t-15) | 
|  | w21       = W_(t-8)  | W_(t-9)  | W_(t-10) | W_(t-11) | 
|  | w22       = W_(t-4)  | W_(t-5)  | W_(t-6)  | W_(t-7) | 
|  | w23[63:0] = W_(t-3) | 
|  | w24[63:0] = W_(t-2) | 
|  | w25[63:0] = W_(t-1) */ | 
|  |  | 
|  | /* w15[255:192] <= s0( W_(t-15) ) | 
|  | = (W_(t-15) ROTR 1) XOR (W_(t-15) ROTR 8) XOR (W_(t-15) SHR 7) */ | 
|  | bn.rshi  w18, w20, w30  >> 64 | 
|  | bn.rshi  w15, w20, w18  >> 1 | 
|  | bn.rshi  w16, w20, w18  >> 8 | 
|  | bn.xor   w15, w15, w16 | 
|  | bn.rshi  w16, w31, w18  >> 7 | 
|  | bn.xor   w15, w15, w16 | 
|  |  | 
|  | /* w26[63:0] <= W_(t-16) + W_(t-7) + s0( W_(t-15) ) */ | 
|  | bn.add   w26, w31, w19 >> 192 | 
|  | bn.add   w26, w26, w15 >> 192 | 
|  | bn.add   w26, w26, w22 | 
|  |  | 
|  | /* w15[255:192] <= s1( W_(t-2) ) | 
|  | = (W_(t-2) ROTR 19) XOR (W_(t-2) ROTR 61) XOR (W_(t-2) SHR 6) */ | 
|  | bn.rshi  w18, w24, w30  >> 64 | 
|  | bn.rshi  w15, w24, w18  >> 19 | 
|  | bn.rshi  w16, w24, w18  >> 61 | 
|  | bn.xor   w15, w15, w16 | 
|  | bn.rshi  w16, w31, w18  >> 6 | 
|  | bn.xor   w15, w15, w16 | 
|  |  | 
|  | /* w26[63:0] = W_t | 
|  | <= W_(t-16) + W_(t-7) + s0( W_(t-15) ) + s1( W_(t-2) ) */ | 
|  | bn.add   w26, w26, w15 >> 192 | 
|  |  | 
|  | /* Forward window by four words */ | 
|  | bn.mov   w19, w20 | 
|  | bn.mov   w20, w21 | 
|  | bn.mov   w21, w22 | 
|  |  | 
|  | /* Assemble 256-bit cell from the 4 words computed above */ | 
|  | /* w22 = w26[63:0] | w25[63:0] | w24[63:0] | w23[63:0] | 
|  | = W[i*4+19] | W[i*4+18] | W[i*4+17] | W[i*4+16] */ | 
|  | bn.rshi  w22, w23, w22  >> 64 | 
|  | bn.rshi  w22, w24, w22  >> 64 | 
|  | bn.rshi  w22, w25, w22  >> 64 | 
|  | bn.rshi  w22, w26, w22  >> 64 | 
|  |  | 
|  |  | 
|  | /* Store the 4 words in dmem scratchpad (x19 selects w22) */ | 
|  | bn.sid   x19, 0(x12++) | 
|  |  | 
|  |  | 
|  | /* load state variables from dmem into the working variables w0 to w7 */ | 
|  | addi     x2, x0, 0 | 
|  | /* w0[63:0] = a <= H_0 */ | 
|  | bn.lid   x2++, 0(x17) | 
|  | /* w1[63:0] = b <= H_1 */ | 
|  | bn.lid   x2++, 32(x17) | 
|  | /* w2[63:0] = c <= H_2 */ | 
|  | bn.lid   x2++, 64(x17) | 
|  | /* w3[63:0] = d <= H_3 */ | 
|  | bn.lid   x2++, 96(x17) | 
|  | /* w4[63:0] = e <= H_4 */ | 
|  | bn.lid   x2++, 128(x17) | 
|  | /* w5[63:0] = f <= H_5 */ | 
|  | bn.lid   x2++, 160(x17) | 
|  | /* w6[63:0] = g <= H_6 */ | 
|  | bn.lid   x2++, 192(x17) | 
|  | /* w7[63:0] = h <= H_7 */ | 
|  | bn.lid   x2++, 224(x17) | 
|  |  | 
|  | /* reset pointer to start of message schedule scratchpad in dmem */ | 
|  | addi x12, x13, 0 | 
|  |  | 
|  | /* reset pointer to beginning of dmem section containing round constants */ | 
|  | addi x15, x16, 0 | 
|  |  | 
|  | /* Main loop for SHA compression function. Processes 8 words of message | 
|  | schedule in one cycle. | 
|  | This saves copying the SHA working variables (a,b,...,h) after each | 
|  | word: instead of moving the values, the roles of w0 to w7 rotate by one | 
|  | register per word and are back in their initial assignment after 8 | 
|  | words. If code size becomes an issue, the size of the loop body can | 
|  | be significantly reduced for the penalty of 6 additional instructions | 
|  | after each word. | 
|  | Below, | 
|  | i denotes the current loop cycle, and | 
|  | t denotes the current word (hence t=i*8+(0,1,...,7) ). | 
|  | Loop body size: 2 loads + 30 + 3*31 instructions (words 0 to 3) | 
|  | + 2 loads + 4*31 instructions (words 4 to 7) = 251. | 
|  | The loop must end with the last instruction of word 7; the instructions | 
|  | following it belong to the state update and are executed once per | 
|  | chunk only. */ | 
|  | loopi 10, 251 | 
|  |  | 
|  | /* Load four round constants from dmem */ | 
|  | /* w10 <= [K_(i*8+3),K_(i*8+2),K_(i*8+1),K_(i*8)] = dmem[K + 2*i] */ | 
|  | bn.lid   x10, 0(x15++) | 
|  |  | 
|  | /* Load four message schedule words from dmem scratchpad */ | 
|  | /* w11 <= [W_(i*8+3),W_(i*8+2),W_(i*8+1),W_(i*8)] = dmem[W + 2*i] */ | 
|  | bn.lid   x11, 0(x12++) | 
|  |  | 
|  | /* w7[63:0] = h */ | 
|  | /* w6[63:0] = g */ | 
|  | /* w5[63:0] = f */ | 
|  | /* w4[63:0] = e */ | 
|  | /* w3[63:0] = d */ | 
|  | /* w2[63:0] = c */ | 
|  | /* w1[63:0] = b */ | 
|  | /* w0[63:0] = a */ | 
|  |  | 
|  |  | 
|  | /* Process word 0 of loop cycle: t <= i*8. */ | 
|  |  | 
|  | /* w15[255:192] = S0(a) = (a ROTR 28) XOR (a ROTR 34) XOR (a ROTR 39) */ | 
|  | bn.rshi  w22,  w0, w30  >> 64 | 
|  | bn.rshi  w15,  w0, w22  >> 28 | 
|  | bn.rshi  w21,  w0, w22  >> 34 | 
|  | bn.xor   w15, w15, w21 | 
|  | bn.rshi  w21,  w0, w22  >> 39 | 
|  | bn.xor   w15, w15, w21 | 
|  |  | 
|  | /* w16[63:0] = Maj(a,b,c) = (a AND b) XOR (a AND c) XOR (b AND c) */ | 
|  | bn.and   w16,  w0,  w1 | 
|  | bn.and   w21,  w0,  w2 | 
|  | bn.xor   w16, w16, w21 | 
|  | bn.and   w21,  w1,  w2 | 
|  | bn.xor   w16, w16, w21 | 
|  |  | 
|  | /* w17[63:0] <= T2 = S0(a) + Maj(a,b,c) = w15[255:192] + w16[63:0] */ | 
|  | bn.rshi  w17, w30, w15  >> 192 | 
|  | bn.add   w17, w17, w16 | 
|  |  | 
|  | /* w18[255:192] <= S1(e) = (e ROTR 14) XOR (e ROTR 18) XOR (e ROTR 41) */ | 
|  | bn.rshi  w22,  w4, w30  >> 64 | 
|  | bn.rshi  w18,  w4, w22  >> 14 | 
|  | bn.rshi  w21,  w4, w22  >> 18 | 
|  | bn.xor   w18, w18, w21 | 
|  | bn.rshi  w19,  w4, w22  >> 41 | 
|  | bn.xor   w18, w18, w19 | 
|  |  | 
|  | /* w19[63:0] <= Ch(e,f,g) = (e AND f) XOR ((NOT e) AND g) */ | 
|  | bn.and   w19,  w4,  w5 | 
|  | bn.not   w21,  w4 | 
|  | bn.and   w21, w21,  w6 | 
|  | bn.xor   w19, w19, w21 | 
|  |  | 
|  | /* w20[63:0] <= T1 = h + S1(e) + Ch(e,f,g) + K_t + W_t | 
|  | K_t and W_t are in the lowest words of w10 and w11, no shift needed. */ | 
|  | bn.rshi  w20, w30, w18  >> 192 | 
|  | bn.add   w20, w20, w7 | 
|  | bn.add   w20, w20, w10 | 
|  | bn.add   w21, w11, w19 | 
|  | bn.add   w20, w20, w21 | 
|  |  | 
|  | /* w6[63:0] = h <= g */ | 
|  | /* w5[63:0] = g <= f */ | 
|  | /* w4[63:0] = f <= e */ | 
|  | /* w3[63:0] = e <= d + T1 = w3[63:0] + w20[63:0] */ | 
|  | bn.add    w3,  w3, w20 | 
|  | /* w2[63:0] = d <= c */ | 
|  | /* w1[63:0] = c <= b */ | 
|  | /* w0[63:0] = b <= a */ | 
|  | /* w7[63:0] = a = T_1 + T_2 = w20[63:0] + w17[63:0] */ | 
|  | bn.add    w7, w20, w17 | 
|  |  | 
|  |  | 
|  | /* Process word 1 of loop cycle: t <= i*8 + 1. */ | 
|  |  | 
|  | /* w15[255:192] = S0(a) = (a ROTR 28) XOR (a ROTR 34) XOR (a ROTR 39) */ | 
|  | bn.rshi  w22,  w7, w30  >> 64 | 
|  | bn.rshi  w15,  w7, w22  >> 28 | 
|  | bn.rshi  w21,  w7, w22  >> 34 | 
|  | bn.xor   w15, w15, w21 | 
|  | bn.rshi  w21,  w7, w22  >> 39 | 
|  | bn.xor   w15, w15, w21 | 
|  |  | 
|  | /* w16[63:0] = Maj(a,b,c) = (a AND b) XOR (a AND c) XOR (b AND c) */ | 
|  | bn.and   w16,  w7,  w0 | 
|  | bn.and   w21,  w7,  w1 | 
|  | bn.xor   w16, w16, w21 | 
|  | bn.and   w21,  w0,  w1 | 
|  | bn.xor   w16, w16, w21 | 
|  |  | 
|  | /* w17[63:0] <= T2 = S0(a) + Maj(a,b,c) = w15[255:192] + w16[63:0] */ | 
|  | bn.rshi  w17, w30, w15  >> 192 | 
|  | bn.add   w17, w17, w16 | 
|  |  | 
|  | /* w18[255:192] <= S1(e) = (e ROTR 14) XOR (e ROTR 18) XOR (e ROTR 41) */ | 
|  | bn.rshi  w22,  w3, w30  >> 64 | 
|  | bn.rshi  w18,  w3, w22  >> 14 | 
|  | bn.rshi  w21,  w3, w22  >> 18 | 
|  | bn.xor   w18, w18, w21 | 
|  | bn.rshi  w19,  w3, w22  >> 41 | 
|  | bn.xor   w18, w18, w19 | 
|  |  | 
|  | /* w19[63:0] <= Ch(e,f,g) = (e AND f) XOR ((NOT e) AND g) */ | 
|  | bn.and   w19,  w3,  w4 | 
|  | bn.not   w21,  w3 | 
|  | bn.and   w21, w21,  w5 | 
|  | bn.xor   w19, w19, w21 | 
|  |  | 
|  | /* w20[63:0] <= T1 = h + S1(e) + Ch(e,f,g) + K_t + W_t */ | 
|  | bn.rshi  w20, w30, w18  >> 192 | 
|  | bn.add   w20, w20,  w6 | 
|  | bn.add   w20, w20, w10  >> 64 | 
|  | bn.rshi  w21, w30, w11  >> 64 | 
|  | bn.add   w21, w21, w19 | 
|  | bn.add   w20, w20, w21 | 
|  |  | 
|  | /* w5[63:0] = h <= g */ | 
|  | /* w4[63:0] = g <= f */ | 
|  | /* w3[63:0] = f <= e */ | 
|  | /* w2[63:0] = e <= d + T1 = w2[63:0] + w20[63:0] */ | 
|  | bn.add w2, w2, w20 | 
|  | /* w1[63:0] = d <= c */ | 
|  | /* w0[63:0] = c <= b */ | 
|  | /* w7[63:0] = b <= a */ | 
|  | /* w6[63:0] = a = T_1 + T_2 = w20[63:0] + w17[63:0] */ | 
|  | bn.add w6, w20, w17 | 
|  |  | 
|  |  | 
|  | /* Process word 2 of loop cycle: t <= i*8 + 2. */ | 
|  |  | 
|  | /* w15[255:192] = S0(a) = (a ROTR 28) XOR (a ROTR 34) XOR (a ROTR 39) */ | 
|  | bn.rshi  w22,  w6, w30  >> 64 | 
|  | bn.rshi  w15,  w6, w22  >> 28 | 
|  | bn.rshi  w21,  w6, w22  >> 34 | 
|  | bn.xor   w15, w15, w21 | 
|  | bn.rshi  w21,  w6, w22  >> 39 | 
|  | bn.xor   w15, w15, w21 | 
|  |  | 
|  | /* w16[63:0] = Maj(a,b,c) = (a AND b) XOR (a AND c) XOR (b AND c) */ | 
|  | bn.and   w16,  w6, w7 | 
|  | bn.and   w21,  w6, w0 | 
|  | bn.xor   w16, w16, w21 | 
|  | bn.and   w21,  w7, w0 | 
|  | bn.xor   w16, w16, w21 | 
|  |  | 
|  | /* w17[63:0] <= T2 = S0(a) + Maj(a,b,c) = w15[255:192] + w16[63:0] */ | 
|  | bn.rshi  w17, w30, w15  >> 192 | 
|  | bn.add   w17, w17, w16 | 
|  |  | 
|  | /* w18[255:192] <= S1(e) = (e ROTR 14) XOR (e ROTR 18) XOR (e ROTR 41) */ | 
|  | bn.rshi  w22,  w2, w30  >> 64 | 
|  | bn.rshi  w18,  w2, w22  >> 14 | 
|  | bn.rshi  w21,  w2, w22  >> 18 | 
|  | bn.xor   w18, w18, w21 | 
|  | bn.rshi  w19,  w2, w22  >> 41 | 
|  | bn.xor   w18, w18, w19 | 
|  |  | 
|  | /* w19[63:0] <= Ch(e,f,g) = (e AND f) XOR ((NOT e) AND g) */ | 
|  | bn.and   w19,  w2,  w3 | 
|  | bn.not   w21,  w2 | 
|  | bn.and   w21, w21,  w4 | 
|  | bn.xor   w19, w19, w21 | 
|  |  | 
|  | /* w20[63:0] <= T1 = h + S1(e) + Ch(e,f,g) + K_t + W_t */ | 
|  | bn.rshi  w20, w30, w18  >> 192 | 
|  | bn.add   w20, w20,  w5 | 
|  | bn.add   w20, w20, w10  >> 128 | 
|  | bn.rshi  w21, w30, w11  >> 128 | 
|  | bn.add   w21, w21, w19 | 
|  | bn.add   w20, w20, w21 | 
|  |  | 
|  | /* w4[63:0] = h <= g */ | 
|  | /* w3[63:0] = g <= f */ | 
|  | /* w2[63:0] = f <= e */ | 
|  | /* w1[63:0] = e <= d + T1 = w1[63:0] + w20[63:0] */ | 
|  | bn.add   w1,  w1, w20 | 
|  | /* w0[63:0] = d <= c */ | 
|  | /* w7[63:0] = c <= b */ | 
|  | /* w6[63:0] = b <= a */ | 
|  | /* w5[63:0] = a = T_1 + T_2 = w20[63:0] + w17[63:0] */ | 
|  | bn.add   w5, w20, w17 | 
|  |  | 
|  |  | 
|  | /* Process word 3 of loop cycle: t <= i*8 + 3. */ | 
|  |  | 
|  | /* w15[255:192] = S0(a) = (a ROTR 28) XOR (a ROTR 34) XOR (a ROTR 39) */ | 
|  | bn.rshi  w22,  w5, w30  >> 64 | 
|  | bn.rshi  w15,  w5, w22  >> 28 | 
|  | bn.rshi  w21,  w5, w22  >> 34 | 
|  | bn.xor   w15, w15, w21 | 
|  | bn.rshi  w21,  w5, w22  >> 39 | 
|  | bn.xor   w15, w15, w21 | 
|  |  | 
|  | /* w16[63:0] = Maj(a,b,c) = (a AND b) XOR (a AND c) XOR (b AND c) */ | 
|  | bn.and   w16,  w5,  w6 | 
|  | bn.and   w21,  w5,  w7 | 
|  | bn.xor   w16, w16, w21 | 
|  | bn.and   w21,  w6,  w7 | 
|  | bn.xor   w16, w16, w21 | 
|  |  | 
|  | /* w17[63:0] <= T2 = S0(a) + Maj(a,b,c) = w15[255:192] + w16[63:0] */ | 
|  | bn.rshi  w17, w30, w15  >> 192 | 
|  | bn.add   w17, w17, w16 | 
|  |  | 
|  | /* w18[255:192] <= S1(e) = (e ROTR 14) XOR (e ROTR 18) XOR (e ROTR 41) */ | 
|  | bn.rshi  w22,  w1, w30  >> 64 | 
|  | bn.rshi  w18,  w1, w22  >> 14 | 
|  | bn.rshi  w21,  w1, w22  >> 18 | 
|  | bn.xor   w18, w18, w21 | 
|  | bn.rshi  w19,  w1, w22  >> 41 | 
|  | bn.xor   w18, w18, w19 | 
|  |  | 
|  | /* w19[63:0] <= Ch(e,f,g) = (e AND f) XOR ((NOT e) AND g) */ | 
|  | bn.and   w19,  w1,  w2 | 
|  | bn.not   w21,  w1 | 
|  | bn.and   w21, w21,  w3 | 
|  | bn.xor   w19, w19, w21 | 
|  |  | 
|  | /* w20[63:0] <= T1 = h + S1(e) + Ch(e,f,g) + K_t + W_t */ | 
|  | bn.rshi  w20, w30, w18  >> 192 | 
|  | bn.add   w20, w20,  w4 | 
|  | bn.add   w20, w20, w10  >> 192 | 
|  | bn.rshi  w21, w30, w11  >> 192 | 
|  | bn.add   w21, w21, w19 | 
|  | bn.add   w20, w20, w21 | 
|  |  | 
|  | /* w3[63:0] = h <= g */ | 
|  | /* w2[63:0] = g <= f */ | 
|  | /* w1[63:0] = f <= e */ | 
|  | /* w0[63:0] = e <= d + T1 = w0[63:0] + w20[63:0] */ | 
|  | bn.add    w0,  w0, w20 | 
|  | /* w7[63:0] = d <= c */ | 
|  | /* w6[63:0] = c <= b */ | 
|  | /* w5[63:0] = b <= a */ | 
|  | /* w4[63:0] = a = T_1 + T_2 = w20[63:0] + w17[63:0] */ | 
|  | bn.add    w4, w20, w17 | 
|  |  | 
|  |  | 
|  | /* Load another four round constants from dmem */ | 
|  | /* w10 <= [K_(i*8+7),K_(i*8+6),K_(i*8+5),K_(i*8+4)] = dmem[K + 2*i+1] */ | 
|  | bn.lid x10, 0(x15++) | 
|  |  | 
|  | /* Load another four message schedule words from dmem scratchpad */ | 
|  | /* w11 <= [W_(i*8+7),W_(i*8+6),W_(i*8+5),W_(i*8+4)] = dmem[W + 2*i+1] */ | 
|  | bn.lid x11, 0(x12++) | 
|  |  | 
|  |  | 
|  | /* Process word 4 of loop cycle: t <= i*8 + 4. */ | 
|  |  | 
|  | /* w15[255:192] = S0(a) = (a ROTR 28) XOR (a ROTR 34) XOR (a ROTR 39) */ | 
|  | bn.rshi  w22,  w4, w30  >> 64 | 
|  | bn.rshi  w15,  w4, w22  >> 28 | 
|  | bn.rshi  w21,  w4, w22  >> 34 | 
|  | bn.xor   w15, w15, w21 | 
|  | bn.rshi  w21,  w4, w22  >> 39 | 
|  | bn.xor   w15, w15, w21 | 
|  |  | 
|  | /* w16[63:0] = Maj(a,b,c) = (a AND b) XOR (a AND c) XOR (b AND c) */ | 
|  | bn.and   w16,  w4,  w5 | 
|  | bn.and   w21,  w4,  w6 | 
|  | bn.xor   w16, w16, w21 | 
|  | bn.and   w21,  w5,  w6 | 
|  | bn.xor   w16, w16, w21 | 
|  |  | 
|  | /* w17[63:0] <= T2 = S0(a) + Maj(a,b,c) = w15[255:192] + w16[63:0] */ | 
|  | bn.rshi  w17, w30, w15  >> 192 | 
|  | bn.add   w17, w17, w16 | 
|  |  | 
|  | /* w18[255:192] <= S1(e) = (e ROTR 14) XOR (e ROTR 18) XOR (e ROTR 41) */ | 
|  | bn.rshi  w22,  w0, w30  >> 64 | 
|  | bn.rshi  w18,  w0, w22  >> 14 | 
|  | bn.rshi  w21,  w0, w22  >> 18 | 
|  | bn.xor   w18, w18, w21 | 
|  | bn.rshi  w19,  w0, w22  >> 41 | 
|  | bn.xor   w18, w18, w19 | 
|  |  | 
|  | /* w19[63:0] <= Ch(e,f,g) = (e AND f) XOR ((NOT e) AND g) */ | 
|  | bn.and   w19,  w0,  w1 | 
|  | bn.not   w21,  w0 | 
|  | bn.and   w21, w21,  w2 | 
|  | bn.xor   w19, w19, w21 | 
|  |  | 
|  | /* w20[63:0] <= T1 = h + S1(e) + Ch(e,f,g) + K_t + W_t */ | 
|  | bn.rshi  w20, w30, w18  >> 192 | 
|  | bn.add   w20, w20,  w3 | 
|  | bn.add   w20, w20, w10  >> 0 | 
|  | bn.rshi  w21, w30, w11  >> 0 | 
|  | bn.add   w21, w21, w19 | 
|  | bn.add   w20, w20, w21 | 
|  |  | 
|  | /* w2[63:0] = h <= g */ | 
|  | /* w1[63:0] = g <= f */ | 
|  | /* w0[63:0] = f <= e */ | 
|  | /* w7[63:0] = e <= d + T1 = w7[63:0] + w20[63:0] */ | 
|  | bn.add    w7,  w7, w20 | 
|  | /* w6[63:0] = d <= c */ | 
|  | /* w5[63:0] = c <= b */ | 
|  | /* w4[63:0] = b <= a */ | 
|  | /* w3[63:0] = a = T_1 + T_2 = w20[63:0] + w17[63:0] */ | 
|  | bn.add    w3, w20, w17 | 
|  |  | 
|  |  | 
|  | /* Process word 5 of loop cycle: t <= i*8 + 5. */ | 
|  |  | 
|  | /* w15[255:192] = S0(a) = (a ROTR 28) XOR (a ROTR 34) XOR (a ROTR 39) */ | 
|  | bn.rshi  w22,  w3, w30  >> 64 | 
|  | bn.rshi  w15,  w3, w22  >> 28 | 
|  | bn.rshi  w21,  w3, w22  >> 34 | 
|  | bn.xor   w15, w15, w21 | 
|  | bn.rshi  w21,  w3, w22  >> 39 | 
|  | bn.xor   w15, w15, w21 | 
|  |  | 
|  | /* w16[63:0] = Maj(a,b,c) = (a AND b) XOR (a AND c) XOR (b AND c) */ | 
|  | bn.and   w16,  w3,  w4 | 
|  | bn.and   w21,  w3,  w5 | 
|  | bn.xor   w16, w16, w21 | 
|  | bn.and   w21,  w4,  w5 | 
|  | bn.xor   w16, w16, w21 | 
|  |  | 
|  | /* w17[63:0] <= T2 = S0(a) + Maj(a,b,c) = w15[255:192] + w16[63:0] */ | 
|  | bn.rshi  w17, w30, w15  >> 192 | 
|  | bn.add   w17, w17, w16 | 
|  |  | 
|  |  | 
|  | /* w18[255:192] <= S1(e) = (e ROTR 14) XOR (e ROTR 18) XOR (e ROTR 41) */ | 
|  | bn.rshi  w22,  w7, w30  >> 64 | 
|  | bn.rshi  w18,  w7, w22  >> 14 | 
|  | bn.rshi  w21,  w7, w22  >> 18 | 
|  | bn.xor   w18, w18, w21 | 
|  | bn.rshi  w19,  w7, w22  >> 41 | 
|  | bn.xor   w18, w18, w19 | 
|  |  | 
|  |  | 
|  | /* w19[63:0] <= Ch(e,f,g) = (e AND f) XOR ((NOT e) AND g) */ | 
|  | bn.and   w19,  w7,  w0 | 
|  | bn.not   w21,  w7 | 
|  | bn.and   w21, w21,  w1 | 
|  | bn.xor   w19, w19, w21 | 
|  |  | 
|  | /* w20[63:0] <= T1 = h + S1(e) + Ch(e,f,g) + K_t + W_t */ | 
|  | bn.rshi  w20, w30, w18  >> 192 | 
|  | bn.add   w20, w20,  w2 | 
|  | bn.add   w20, w20, w10  >> 64 | 
|  | bn.rshi  w21, w30, w11  >> 64 | 
|  | bn.add   w21, w21, w19 | 
|  | bn.add   w20, w20, w21 | 
|  |  | 
|  | /* w1[63:0] = h <= g */ | 
|  | /* w0[63:0] = g <= f */ | 
|  | /* w7[63:0] = f <= e */ | 
|  | /* w6[63:0] = e <= d + T1 = w6[63:0] + w20[63:0] */ | 
|  | bn.add    w6,  w6, w20 | 
|  | /* w5[63:0] = d <= c */ | 
|  | /* w4[63:0] = c <= b */ | 
|  | /* w3[63:0] = b <= a */ | 
|  | /* w2[63:0] = a = T_1 + T_2 = w20[63:0] + w17[63:0] */ | 
|  | bn.add    w2, w20, w17 | 
|  |  | 
|  | /* Process word 6 of loop cycle: t <= i*8 + 6. */ | 
|  |  | 
|  | /* w15[255:192] = S0(a) = (a ROTR 28) XOR (a ROTR 34) XOR (a ROTR 39) */ | 
|  | bn.rshi  w22,  w2, w30  >> 64 | 
|  | bn.rshi  w15,  w2, w22  >> 28 | 
|  | bn.rshi  w21,  w2, w22  >> 34 | 
|  | bn.xor   w15, w15, w21 | 
|  | bn.rshi  w21,  w2, w22  >> 39 | 
|  | bn.xor   w15, w15, w21 | 
|  |  | 
|  | /* w16[63:0] = Maj(a,b,c) = (a AND b) XOR (a AND c) XOR (b AND c) */ | 
|  | bn.and   w16,  w2,  w3 | 
|  | bn.and   w21,  w2,  w4 | 
|  | bn.xor   w16, w16, w21 | 
|  | bn.and   w21,  w3,  w4 | 
|  | bn.xor   w16, w16, w21 | 
|  |  | 
|  | /* w17[63:0] <= T2 = S0(a) + Maj(a,b,c) = w15[255:192] + w16[63:0] */ | 
|  | bn.rshi  w17, w30, w15  >> 192 | 
|  | bn.add   w17, w17, w16 | 
|  |  | 
|  | /* w18[255:192] <= S1(e) = (e ROTR 14) XOR (e ROTR 18) XOR (e ROTR 41) */ | 
|  | bn.rshi  w22,  w6, w30  >> 64 | 
|  | bn.rshi  w18,  w6, w22  >> 14 | 
|  | bn.rshi  w21,  w6, w22  >> 18 | 
|  | bn.xor   w18, w18, w21 | 
|  | bn.rshi  w19,  w6, w22  >> 41 | 
|  | bn.xor   w18, w18, w19 | 
|  |  | 
|  | /* w19[63:0] <= Ch(e,f,g) = (e AND f) XOR ((NOT e) AND g) */ | 
|  | bn.and   w19,  w6,  w7 | 
|  | bn.not   w21,  w6 | 
|  | bn.and   w21, w21,  w0 | 
|  | bn.xor   w19, w19, w21 | 
|  |  | 
|  | /* w20[63:0] <= T1 = h + S1(e) + Ch(e,f,g) + K_t + W_t */ | 
|  | bn.rshi  w20, w30, w18  >> 192 | 
|  | bn.add   w20, w20,  w1 | 
|  | bn.add   w20, w20, w10  >> 128 | 
|  | bn.rshi  w21, w30, w11  >> 128 | 
|  | bn.add   w21, w21, w19 | 
|  | bn.add   w20, w20, w21 | 
|  |  | 
|  | /* w0[63:0] = h <= g */ | 
|  | /* w7[63:0] = g <= f */ | 
|  | /* w6[63:0] = f <= e */ | 
|  | /* w5[63:0] = e <= d + T1 = w5[63:0] + w20[63:0] */ | 
|  | bn.add    w5,  w5, w20 | 
|  | /* w4[63:0] = d <= c */ | 
|  | /* w3[63:0] = c <= b */ | 
|  | /* w2[63:0] = b <= a */ | 
|  | /* w1[63:0] = a = T_1 + T_2 = w20[63:0] + w17[63:0] */ | 
|  | bn.add    w1, w20, w17 | 
|  |  | 
|  |  | 
|  | /* Process word 7 of loop cycle: t <= i*8 + 7. */ | 
|  |  | 
|  | /* w15[255:192] = S0(a) = (a ROTR 28) XOR (a ROTR 34) XOR (a ROTR 39) */ | 
|  | bn.rshi  w22,  w1, w30  >> 64 | 
|  | bn.rshi  w15,  w1, w22  >> 28 | 
|  | bn.rshi  w21,  w1, w22  >> 34 | 
|  | bn.xor   w15, w15, w21 | 
|  | bn.rshi  w21,  w1, w22  >> 39 | 
|  | bn.xor   w15, w15, w21 | 
|  |  | 
|  | /* w16[63:0] = Maj(a,b,c) = (a AND b) XOR (a AND c) XOR (b AND c) */ | 
|  | bn.and   w16,  w1,  w2 | 
|  | bn.and   w21,  w1,  w3 | 
|  | bn.xor   w16, w16, w21 | 
|  | bn.and   w21,  w2,  w3 | 
|  | bn.xor   w16, w16, w21 | 
|  |  | 
|  | /* w17[63:0] <= T2 = S0(a) + Maj(a,b,c) = w15[255:192] + w16[63:0] */ | 
|  | bn.rshi  w17, w30, w15  >> 192 | 
|  | bn.add   w17, w17, w16 | 
|  |  | 
|  | /* w18[255:192] <= S1(e) = (e ROTR 14) XOR (e ROTR 18) XOR (e ROTR 41) */ | 
|  | bn.rshi  w22,  w5, w30  >> 64 | 
|  | bn.rshi  w18,  w5, w22  >> 14 | 
|  | bn.rshi  w21,  w5, w22  >> 18 | 
|  | bn.xor   w18, w18, w21 | 
|  | bn.rshi  w19,  w5, w22  >> 41 | 
|  | bn.xor   w18, w18, w19 | 
|  |  | 
|  | /* w19[63:0] <= Ch(e,f,g) = (e AND f) XOR ((NOT e) AND g) */ | 
|  | bn.and   w19,  w5,  w6 | 
|  | bn.not   w21,  w5 | 
|  | bn.and   w21, w21,  w7 | 
|  | bn.xor   w19, w19, w21 | 
|  |  | 
|  | /* w20[63:0] <= T1 = h + S1(e) + Ch(e,f,g) + K_t + W_t */ | 
|  | bn.rshi  w20, w30, w18  >> 192 | 
|  | bn.add   w20, w20,  w0 | 
|  | bn.add   w20, w20, w10  >> 192 | 
|  | bn.rshi  w21, w30, w11  >> 192 | 
|  | bn.add   w21, w21, w19 | 
|  | bn.add   w20, w20, w21 | 
|  |  | 
|  | /* w7[63:0] = h <= g */ | 
|  | /* w6[63:0] = g <= f */ | 
|  | /* w5[63:0] = f <= e */ | 
|  | /* w4[63:0] = e <= d + T1 = w4[63:0] + w20[63:0] */ | 
|  | bn.add w4, w4, w20 | 
|  | /* w3[63:0] = d <= c */ | 
|  | /* w2[63:0] = c <= b */ | 
|  | /* w1[63:0] = b <= a */ | 
|  | /* w0[63:0] = a = T_1 + T_2 = w20[63:0] + w17[63:0] */ | 
|  | bn.add w0, w20, w17 | 
|  |  | 
|  | /* End of compression loop body. The working variables a to h are back in | 
|  | w0 to w7. */ | 
|  |  | 
|  |  | 
|  | /* Add compressed chunk to current hash value. | 
|  | x2 selects w15 as temporary register for the state cells. */ | 
|  | addi      x2, x0, 15 | 
|  |  | 
|  | /* H_0 <= H_0 + a */ | 
|  | bn.lid    x2, 0(x17) | 
|  | bn.add   w15, w0, w15 | 
|  | bn.sid    x2, 0(x17) | 
|  |  | 
|  | /* H_1 <= H_1 + b */ | 
|  | bn.lid    x2, 32(x17) | 
|  | bn.add   w15, w1, w15 | 
|  | bn.sid    x2, 32(x17) | 
|  |  | 
|  | /* H_2 <= H_2 + c */ | 
|  | bn.lid    x2, 64(x17) | 
|  | bn.add   w15, w2, w15 | 
|  | bn.sid    x2, 64(x17) | 
|  |  | 
|  | /* H_3 <= H_3 + d */ | 
|  | bn.lid    x2, 96(x17) | 
|  | bn.add   w15, w3, w15 | 
|  | bn.sid    x2, 96(x17) | 
|  |  | 
|  | /* H_4 <= H_4 + e */ | 
|  | bn.lid    x2, 128(x17) | 
|  | bn.add   w15, w4, w15 | 
|  | bn.sid    x2, 128(x17) | 
|  |  | 
|  | /* H_5 <= H_5 + f */ | 
|  | bn.lid    x2, 160(x17) | 
|  | bn.add   w15, w5, w15 | 
|  | bn.sid    x2, 160(x17) | 
|  |  | 
|  | /* H_6 <= H_6 + g */ | 
|  | bn.lid    x2, 192(x17) | 
|  | bn.add   w15, w6, w15 | 
|  | bn.sid    x2, 192(x17) | 
|  |  | 
|  | /* H_7 <= H_7 + h (last instruction of the per-chunk loop body) */ | 
|  | bn.lid    x2, 224(x17) | 
|  | bn.add   w15, w7, w15 | 
|  | bn.sid    x2, 224(x17) | 
|  |  | 
|  | ret | 
|  |  | 
|  | .bss | 
|  |  | 
|  | /* n_chunks: 32-bit word read by sha512 as the number of 1024-bit chunks */ | 
|  | .balign 4 | 
|  | .globl n_chunks | 
|  | n_chunks: | 
|  | .space 4 | 
|  |  | 
|  | /* dptr_state: 32-bit dmem address of the eight 256-bit state cells */ | 
|  | .balign 4 | 
|  | .globl dptr_state | 
|  | dptr_state: | 
|  | .space 4 | 
|  |  | 
|  | /* dptr_msg: 32-bit dmem address of the pre-formatted message chunks */ | 
|  | .balign 4 | 
|  | .globl dptr_msg | 
|  | dptr_msg: | 
|  | .space 4 | 
|  |  | 
|  |  | 
|  | /* Message schedule scratchpad: 80 words of 8 bytes = 640 bytes, i.e. 20 | 
|  | cells of 256 bit. Aligned to 32 bytes (one 256-bit cell). */ | 
|  | .section .scratchpad | 
|  | .p2align 5 | 
|  | W: | 
|  | .space 640 | 
|  |  | 
|  | .data | 
|  |  | 
|  | /* SHA-512 round constants K_0 to K_79. | 
|  | Each line below fills one 256-bit cell with four consecutive constants, | 
|  | lowest index first (at the lowest address). */ | 
|  | .balign 32 | 
|  | K: | 
|  | .dword 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc | 
|  | .dword 0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 | 
|  | .dword 0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 | 
|  | .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694 | 
|  | .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 | 
|  | .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 | 
|  | .dword 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4 | 
|  | .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70 | 
|  | .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df | 
|  | .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b | 
|  | .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30 | 
|  | .dword 0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8 | 
|  | .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 | 
|  | .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 | 
|  | .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec | 
|  | .dword 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b | 
|  | .dword 0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 | 
|  | .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b | 
|  | .dword 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c | 
|  | .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 | 