LCOV - code coverage report
Current view: top level - src/crypto - sha256_arm_shani.cpp (source / functions) Hit Total Coverage
Test: total_coverage.info Lines: 0 646 0.0 %
Date: 2026-06-25 07:23:43 Functions: 0 2 0.0 %

          Line data    Source code
       1             : // Copyright (c) 2022 The Bitcoin Core developers
       2             : // Distributed under the MIT software license, see the accompanying
       3             : // file COPYING or http://www.opensource.org/licenses/mit-license.php.
       4             : //
       5             : // Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c,
       6             : // Written and placed in public domain by Jeffrey Walton.
       7             : // Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
       8             : // Barry O'Rourke for the mbedTLS project.
       9             : // Variant specialized for 64-byte inputs added by Pieter Wuille.
      10             : 
      11             : #ifdef ENABLE_ARM_SHANI
      12             : 
      13             : #include <array>
      14             : #include <cstdint>
      15             : #include <cstddef>
      16             : #include <arm_acle.h>
      17             : #include <arm_neon.h>
      18             : 
      19             : namespace { // file-local: K is only visible inside this translation unit
      20             : alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> K = // the 64 SHA-256 round constants, aligned like a uint32x4_t
      21             : { // read four at a time with vld1q_u32(&K[4 * q]): row q below feeds rounds 4q+1 .. 4q+4
      22             :     0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
      23             :     0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
      24             :     0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
      25             :     0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
      26             :     0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
      27             :     0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
      28             :     0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
      29             :     0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
      30             :     0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
      31             :     0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
      32             :     0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
      33             :     0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
      34             :     0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
      35             :     0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
      36             :     0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
      37             :     0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
      38             : };
      39             : } // namespace
      40             : 
      41             : namespace sha256_arm_shani { // SHA-256 block transform built on the ARM vsha256* intrinsics
      42           0 : void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
      43             : { // Folds `blocks` consecutive 64-byte blocks read from `chunk` into the eight state words at `s`, in place.
      44             :     uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; // two state halves plus their per-block backups
      45             :     uint32x4_t MSG0, MSG1, MSG2, MSG3; // the 16 message words currently in flight, four per vector
      46             :     uint32x4_t TMP0, TMP2; // TMP0: message words + K; TMP2: STATE0 as it was before the current quad round
      47             :     // s is read as s[0..3] and s[4..7] here and written back the same way after the loop.
      48             :     // Load state
      49           0 :     STATE0 = vld1q_u32(&s[0]);
      50           0 :     STATE1 = vld1q_u32(&s[4]);
      51             : 
      52           0 :     while (blocks--) // one 64-byte block per pass; with blocks == 0 the state is stored back unchanged
      53             :     {
      54             :         // Save state (added back in after the 64 rounds)
      55           0 :         ABEF_SAVE = STATE0;
      56           0 :         CDGH_SAVE = STATE1;
      57             : 
      58             :         // Load the block as four 16-byte vectors; vrev32q_u8 reverses the bytes inside each 32-bit lane (big-endian message words)
      59           0 :         MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 0)));
      60           0 :         MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 16)));
      61           0 :         MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 32)));
      62           0 :         MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 48)));
      63           0 :         chunk += 64; // step to the next block
      64             : 
      65             :         // The original implementation preloaded the message + constant sum ahead of the rounds, which was 1-3% slower.
      66             :         // The addition is now the first step of each quad round, which saves one Q NEON register:
      67             :         // "TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));"
      68             : 
      69             :         // Rounds 1-4. The same six steps repeat for rounds 1-48, with the MSG vectors rotating roles.
      70           0 :         TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0])); // message words + round constants for these four rounds
      71           0 :         TMP2 = STATE0; // keep the pre-update STATE0: vsha256h2q_u32 below still needs it
      72           0 :         MSG0 = vsha256su0q_u32(MSG0, MSG1); // schedule update 0
      73           0 :         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); // hash update, part 1
      74           0 :         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); // hash update, part 2
      75           0 :         MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3); // schedule update 1: MSG0 now holds the words used in rounds 17-20
      76             : 
      77             :         // Rounds 5-8
      78           0 :         TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[4]));
      79           0 :         TMP2 = STATE0;
      80           0 :         MSG1 = vsha256su0q_u32(MSG1, MSG2);
      81           0 :         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
      82           0 :         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
      83           0 :         MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
      84             : 
      85             :         // Rounds 9-12
      86           0 :         TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[8]));
      87           0 :         TMP2 = STATE0;
      88           0 :         MSG2 = vsha256su0q_u32(MSG2, MSG3);
      89           0 :         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
      90           0 :         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
      91           0 :         MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
      92             : 
      93             :         // Rounds 13-16
      94           0 :         TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[12]));
      95           0 :         TMP2 = STATE0;
      96           0 :         MSG3 = vsha256su0q_u32(MSG3, MSG0);
      97           0 :         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
      98           0 :         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
      99           0 :         MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
     100             : 
     101             :         // Rounds 17-20
     102           0 :         TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[16]));
     103           0 :         TMP2 = STATE0;
     104           0 :         MSG0 = vsha256su0q_u32(MSG0, MSG1);
     105           0 :         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
     106           0 :         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
     107           0 :         MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
     108             : 
     109             :         // Rounds 21-24
     110           0 :         TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[20]));
     111           0 :         TMP2 = STATE0;
     112           0 :         MSG1 = vsha256su0q_u32(MSG1, MSG2);
     113           0 :         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
     114           0 :         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
     115           0 :         MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
     116             : 
     117             :         // Rounds 25-28
     118           0 :         TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[24]));
     119           0 :         TMP2 = STATE0;
     120           0 :         MSG2 = vsha256su0q_u32(MSG2, MSG3);
     121           0 :         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
     122           0 :         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
     123           0 :         MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
     124             : 
     125             :         // Rounds 29-32
     126           0 :         TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[28]));
     127           0 :         TMP2 = STATE0;
     128           0 :         MSG3 = vsha256su0q_u32(MSG3, MSG0);
     129           0 :         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
     130           0 :         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
     131           0 :         MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
     132             : 
     133             :         // Rounds 33-36
     134           0 :         TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[32]));
     135           0 :         TMP2 = STATE0;
     136           0 :         MSG0 = vsha256su0q_u32(MSG0, MSG1);
     137           0 :         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
     138           0 :         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
     139           0 :         MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
     140             : 
     141             :         // Rounds 37-40
     142           0 :         TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[36]));
     143           0 :         TMP2 = STATE0;
     144           0 :         MSG1 = vsha256su0q_u32(MSG1, MSG2);
     145           0 :         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
     146           0 :         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
     147           0 :         MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
     148             : 
     149             :         // Rounds 41-44
     150           0 :         TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[40]));
     151           0 :         TMP2 = STATE0;
     152           0 :         MSG2 = vsha256su0q_u32(MSG2, MSG3);
     153           0 :         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
     154           0 :         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
     155           0 :         MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
     156             : 
     157             :         // Rounds 45-48
     158           0 :         TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[44]));
     159           0 :         TMP2 = STATE0;
     160           0 :         MSG3 = vsha256su0q_u32(MSG3, MSG0);
     161           0 :         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
     162           0 :         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
     163           0 :         MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
     164             : 
     165             :         // Rounds 49-52. From here on the MSG vectors are only consumed: no vsha256su0q/su1q calls remain.
     166           0 :         TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[48]));
     167           0 :         TMP2 = STATE0;
     168           0 :         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
     169           0 :         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
     170             : 
     171             :         // Rounds 53-56
     172           0 :         TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[52]));
     173           0 :         TMP2 = STATE0;
     174           0 :         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
     175           0 :         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
     176             : 
     177             :         // Rounds 57-60
     178           0 :         TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[56]));
     179           0 :         TMP2 = STATE0;
     180           0 :         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
     181           0 :         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
     182             : 
     183             :         // Rounds 61-64
     184           0 :         TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[60]));
     185           0 :         TMP2 = STATE0;
     186           0 :         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
     187           0 :         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
     188             : 
     189             :         // Update state: add the values saved at the top of this pass
     190           0 :         STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
     191           0 :         STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
     192             :     }
     193             : 
     194             :     // Save final state back into s
     195           0 :     vst1q_u32(&s[0], STATE0);
     196           0 :     vst1q_u32(&s[4], STATE1);
     197           0 : }
     198             : } // namespace sha256_arm_shani
     199             : 
     200             : namespace sha256d64_arm_shani {
     201           0 : void Transform_2way(unsigned char* output, const unsigned char* input)
     202             : {
     203             :     /* Initial state. */
     204             :     alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> INIT = {
     205             :         0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
     206             :         0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
     207             :     };
     208             : 
     209             :     /* Precomputed message schedule for the 2nd transform. */
     210             :     alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> MIDS = {
     211             :         0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
     212             :         0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
     213             :         0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
     214             :         0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374,
     215             :         0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254,
     216             :         0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa,
     217             :         0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7,
     218             :         0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0,
     219             :         0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd,
     220             :         0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16,
     221             :         0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537,
     222             :         0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37,
     223             :         0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7,
     224             :         0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890,
     225             :         0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c,
     226             :         0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76
     227             :     };
     228             : 
     229             :     /* A few precomputed message schedule values for the 3rd transform. */
     230             :     alignas(uint32x4_t) static constexpr std::array<uint32_t, 12> FINS = {
     231             :         0x5807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
     232             :         0x80000000, 0x00000000, 0x00000000, 0x00000000,
     233             :         0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf274
     234             :     };
     235             : 
     236             :     /* Padding processed in the 3rd transform (byteswapped). */
     237             :     alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> FINAL = {0x80000000, 0, 0, 0, 0, 0, 0, 0x100};
     238             : 
     239             :     uint32x4_t STATE0A, STATE0B, STATE1A, STATE1B, ABEF_SAVEA, ABEF_SAVEB, CDGH_SAVEA, CDGH_SAVEB;
     240             :     uint32x4_t MSG0A, MSG0B, MSG1A, MSG1B, MSG2A, MSG2B, MSG3A, MSG3B;
     241             :     uint32x4_t TMP0A, TMP0B, TMP2A, TMP2B, TMP;
     242             : 
     243             :     // Transform 1: Load state
     244           0 :     STATE0A = vld1q_u32(&INIT[0]);
     245           0 :     STATE0B = STATE0A;
     246           0 :     STATE1A = vld1q_u32(&INIT[4]);
     247           0 :     STATE1B = STATE1A;
     248             : 
     249             :     // Transform 1: Load and convert input chunk to Big Endian
     250           0 :     MSG0A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 0)));
     251           0 :     MSG1A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16)));
     252           0 :     MSG2A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32)));
     253           0 :     MSG3A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48)));
     254           0 :     MSG0B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 64)));
     255           0 :     MSG1B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 80)));
     256           0 :     MSG2B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 96)));
     257           0 :     MSG3B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 112)));
     258             : 
     259             :     // Transform 1: Rounds 1-4
     260           0 :     TMP = vld1q_u32(&K[0]);
     261           0 :     TMP0A = vaddq_u32(MSG0A, TMP);
     262           0 :     TMP0B = vaddq_u32(MSG0B, TMP);
     263           0 :     TMP2A = STATE0A;
     264           0 :     TMP2B = STATE0B;
     265           0 :     MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
     266           0 :     MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
     267           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     268           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     269           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     270           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     271           0 :     MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
     272           0 :     MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
     273             : 
     274             :     // Transform 1: Rounds 5-8
     275           0 :     TMP = vld1q_u32(&K[4]);
     276           0 :     TMP0A = vaddq_u32(MSG1A, TMP);
     277           0 :     TMP0B = vaddq_u32(MSG1B, TMP);
     278           0 :     TMP2A = STATE0A;
     279           0 :     TMP2B = STATE0B;
     280           0 :     MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
     281           0 :     MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
     282           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     283           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     284           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     285           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     286           0 :     MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
     287           0 :     MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
     288             : 
     289             :     // Transform 1: Rounds 9-12
     290           0 :     TMP = vld1q_u32(&K[8]);
     291           0 :     TMP0A = vaddq_u32(MSG2A, TMP);
     292           0 :     TMP0B = vaddq_u32(MSG2B, TMP);
     293           0 :     TMP2A = STATE0A;
     294           0 :     TMP2B = STATE0B;
     295           0 :     MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
     296           0 :     MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
     297           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     298           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     299           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     300           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     301           0 :     MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
     302           0 :     MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
     303             : 
     304             :     // Transform 1: Rounds 13-16
     305           0 :     TMP = vld1q_u32(&K[12]);
     306           0 :     TMP0A = vaddq_u32(MSG3A, TMP);
     307           0 :     TMP0B = vaddq_u32(MSG3B, TMP);
     308           0 :     TMP2A = STATE0A;
     309           0 :     TMP2B = STATE0B;
     310           0 :     MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
     311           0 :     MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
     312           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     313           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     314           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     315           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     316           0 :     MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
     317           0 :     MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
     318             : 
     319             :     // Transform 1: Rounds 17-20
     320           0 :     TMP = vld1q_u32(&K[16]);
     321           0 :     TMP0A = vaddq_u32(MSG0A, TMP);
     322           0 :     TMP0B = vaddq_u32(MSG0B, TMP);
     323           0 :     TMP2A = STATE0A;
     324           0 :     TMP2B = STATE0B;
     325           0 :     MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
     326           0 :     MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
     327           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     328           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     329           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     330           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     331           0 :     MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
     332           0 :     MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
     333             : 
     334             :     // Transform 1: Rounds 21-24
     335           0 :     TMP = vld1q_u32(&K[20]);
     336           0 :     TMP0A = vaddq_u32(MSG1A, TMP);
     337           0 :     TMP0B = vaddq_u32(MSG1B, TMP);
     338           0 :     TMP2A = STATE0A;
     339           0 :     TMP2B = STATE0B;
     340           0 :     MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
     341           0 :     MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
     342           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     343           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     344           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     345           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     346           0 :     MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
     347           0 :     MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
     348             : 
     349             :     // Transform 1: Rounds 25-28
     350           0 :     TMP = vld1q_u32(&K[24]);
     351           0 :     TMP0A = vaddq_u32(MSG2A, TMP);
     352           0 :     TMP0B = vaddq_u32(MSG2B, TMP);
     353           0 :     TMP2A = STATE0A;
     354           0 :     TMP2B = STATE0B;
     355           0 :     MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
     356           0 :     MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
     357           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     358           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     359           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     360           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     361           0 :     MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
     362           0 :     MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
     363             : 
     364             :     // Transform 1: Rounds 29-32
     365           0 :     TMP = vld1q_u32(&K[28]);
     366           0 :     TMP0A = vaddq_u32(MSG3A, TMP);
     367           0 :     TMP0B = vaddq_u32(MSG3B, TMP);
     368           0 :     TMP2A = STATE0A;
     369           0 :     TMP2B = STATE0B;
     370           0 :     MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
     371           0 :     MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
     372           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     373           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     374           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     375           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     376           0 :     MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
     377           0 :     MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
     378             : 
     379             :     // Transform 1: Rounds 33-36
     380           0 :     TMP = vld1q_u32(&K[32]);
     381           0 :     TMP0A = vaddq_u32(MSG0A, TMP);
     382           0 :     TMP0B = vaddq_u32(MSG0B, TMP);
     383           0 :     TMP2A = STATE0A;
     384           0 :     TMP2B = STATE0B;
     385           0 :     MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
     386           0 :     MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
     387           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     388           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     389           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     390           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     391           0 :     MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
     392           0 :     MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
     393             : 
     394             :     // Transform 1: Rounds 37-40
     395           0 :     TMP = vld1q_u32(&K[36]);
     396           0 :     TMP0A = vaddq_u32(MSG1A, TMP);
     397           0 :     TMP0B = vaddq_u32(MSG1B, TMP);
     398           0 :     TMP2A = STATE0A;
     399           0 :     TMP2B = STATE0B;
     400           0 :     MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
     401           0 :     MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
     402           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     403           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     404           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     405           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     406           0 :     MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
     407           0 :     MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
     408             : 
     409             :     // Transform 1: Rounds 41-44
     410           0 :     TMP = vld1q_u32(&K[40]);
     411           0 :     TMP0A = vaddq_u32(MSG2A, TMP);
     412           0 :     TMP0B = vaddq_u32(MSG2B, TMP);
     413           0 :     TMP2A = STATE0A;
     414           0 :     TMP2B = STATE0B;
     415           0 :     MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
     416           0 :     MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
     417           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     418           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     419           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     420           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     421           0 :     MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
     422           0 :     MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
     423             : 
     424             :     // Transform 1: Rounds 45-48
     425           0 :     TMP = vld1q_u32(&K[44]);
     426           0 :     TMP0A = vaddq_u32(MSG3A, TMP);
     427           0 :     TMP0B = vaddq_u32(MSG3B, TMP);
     428           0 :     TMP2A = STATE0A;
     429           0 :     TMP2B = STATE0B;
     430           0 :     MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
     431           0 :     MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
     432           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     433           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     434           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     435           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     436           0 :     MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
     437           0 :     MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
     438             : 
     439             :     // Transform 1: Rounds 49-52
     440           0 :     TMP = vld1q_u32(&K[48]);
     441           0 :     TMP0A = vaddq_u32(MSG0A, TMP);
     442           0 :     TMP0B = vaddq_u32(MSG0B, TMP);
     443           0 :     TMP2A = STATE0A;
     444           0 :     TMP2B = STATE0B;
     445           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     446           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     447           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     448           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     449             : 
     450             :     // Transform 1: Rounds 53-56
     451           0 :     TMP = vld1q_u32(&K[52]);
     452           0 :     TMP0A = vaddq_u32(MSG1A, TMP);
     453           0 :     TMP0B = vaddq_u32(MSG1B, TMP);
     454           0 :     TMP2A = STATE0A;
     455           0 :     TMP2B = STATE0B;
     456           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     457           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     458           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     459           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     460             : 
     461             :     // Transform 1: Rounds 57-60
     462           0 :     TMP = vld1q_u32(&K[56]);
     463           0 :     TMP0A = vaddq_u32(MSG2A, TMP);
     464           0 :     TMP0B = vaddq_u32(MSG2B, TMP);
     465           0 :     TMP2A = STATE0A;
     466           0 :     TMP2B = STATE0B;
     467           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     468           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     469           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     470           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     471             : 
     472             :     // Transform 1: Rounds 61-64
     473           0 :     TMP = vld1q_u32(&K[60]);
     474           0 :     TMP0A = vaddq_u32(MSG3A, TMP);
     475           0 :     TMP0B = vaddq_u32(MSG3B, TMP);
     476           0 :     TMP2A = STATE0A;
     477           0 :     TMP2B = STATE0B;
     478           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     479           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     480           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     481           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     482             : 
     483             :     // Transform 1: Update state
     484           0 :     TMP = vld1q_u32(&INIT[0]);
     485           0 :     STATE0A = vaddq_u32(STATE0A, TMP);
     486           0 :     STATE0B = vaddq_u32(STATE0B, TMP);
     487           0 :     TMP = vld1q_u32(&INIT[4]);
     488           0 :     STATE1A = vaddq_u32(STATE1A, TMP);
     489           0 :     STATE1B = vaddq_u32(STATE1B, TMP);
     490             : 
     491             :     // Transform 2: Save state
     492           0 :     ABEF_SAVEA = STATE0A;
     493           0 :     ABEF_SAVEB = STATE0B;
     494           0 :     CDGH_SAVEA = STATE1A;
     495           0 :     CDGH_SAVEB = STATE1B;
     496             : 
     497             :     // Transform 2: Rounds 1-4
     498           0 :     TMP = vld1q_u32(&MIDS[0]);
     499           0 :     TMP2A = STATE0A;
     500           0 :     TMP2B = STATE0B;
     501           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
     502           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
     503           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
     504           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
     505             : 
     506             :     // Transform 2: Rounds 5-8
     507           0 :     TMP = vld1q_u32(&MIDS[4]);
     508           0 :     TMP2A = STATE0A;
     509           0 :     TMP2B = STATE0B;
     510           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
     511           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
     512           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
     513           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
     514             : 
     515             :     // Transform 2: Rounds 9-12
     516           0 :     TMP = vld1q_u32(&MIDS[8]);
     517           0 :     TMP2A = STATE0A;
     518           0 :     TMP2B = STATE0B;
     519           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
     520           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
     521           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
     522           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
     523             : 
     524             :     // Transform 2: Rounds 13-16
     525           0 :     TMP = vld1q_u32(&MIDS[12]);
     526           0 :     TMP2A = STATE0A;
     527           0 :     TMP2B = STATE0B;
     528           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
     529           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
     530           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
     531           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
     532             : 
     533             :     // Transform 2: Rounds 17-20
     534           0 :     TMP = vld1q_u32(&MIDS[16]);
     535           0 :     TMP2A = STATE0A;
     536           0 :     TMP2B = STATE0B;
     537           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
     538           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
     539           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
     540           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
     541             : 
     542             :     // Transform 2: Rounds 21-24
     543           0 :     TMP = vld1q_u32(&MIDS[20]);
     544           0 :     TMP2A = STATE0A;
     545           0 :     TMP2B = STATE0B;
     546           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
     547           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
     548           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
     549           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
     550             : 
     551             :     // Transform 2: Rounds 25-28
     552           0 :     TMP = vld1q_u32(&MIDS[24]);
     553           0 :     TMP2A = STATE0A;
     554           0 :     TMP2B = STATE0B;
     555           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
     556           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
     557           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
     558           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
     559             : 
     560             :     // Transform 2: Rounds 29-32
     561           0 :     TMP = vld1q_u32(&MIDS[28]);
     562           0 :     TMP2A = STATE0A;
     563           0 :     TMP2B = STATE0B;
     564           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
     565           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
     566           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
     567           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
     568             : 
     569             :     // Transform 2: Rounds 33-36
     570           0 :     TMP = vld1q_u32(&MIDS[32]);
     571           0 :     TMP2A = STATE0A;
     572           0 :     TMP2B = STATE0B;
     573           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
     574           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
     575           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
     576           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
     577             : 
     578             :     // Transform 2: Rounds 37-40
     579           0 :     TMP = vld1q_u32(&MIDS[36]);
     580           0 :     TMP2A = STATE0A;
     581           0 :     TMP2B = STATE0B;
     582           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
     583           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
     584           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
     585           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
     586             : 
     587             :     // Transform 2: Rounds 41-44
     588           0 :     TMP = vld1q_u32(&MIDS[40]);
     589           0 :     TMP2A = STATE0A;
     590           0 :     TMP2B = STATE0B;
     591           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
     592           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
     593           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
     594           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
     595             : 
     596             :     // Transform 2: Rounds 45-48
     597           0 :     TMP = vld1q_u32(&MIDS[44]);
     598           0 :     TMP2A = STATE0A;
     599           0 :     TMP2B = STATE0B;
     600           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
     601           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
     602           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
     603           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
     604             : 
     605             :     // Transform 2: Rounds 49-52
     606           0 :     TMP = vld1q_u32(&MIDS[48]);
     607           0 :     TMP2A = STATE0A;
     608           0 :     TMP2B = STATE0B;
     609           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
     610           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
     611           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
     612           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
     613             : 
     614             :     // Transform 2: Rounds 53-56
     615           0 :     TMP = vld1q_u32(&MIDS[52]);
     616           0 :     TMP2A = STATE0A;
     617           0 :     TMP2B = STATE0B;
     618           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
     619           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
     620           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
     621           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
     622             : 
     623             :     // Transform 2: Rounds 57-60
     624           0 :     TMP = vld1q_u32(&MIDS[56]);
     625           0 :     TMP2A = STATE0A;
     626           0 :     TMP2B = STATE0B;
     627           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
     628           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
     629           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
     630           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
     631             : 
     632             :     // Transform 2: Rounds 61-64
     633           0 :     TMP = vld1q_u32(&MIDS[60]);
     634           0 :     TMP2A = STATE0A;
     635           0 :     TMP2B = STATE0B;
     636           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
     637           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
     638           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
     639           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
     640             : 
     641             :     // Transform 2: Update state
     642           0 :     STATE0A = vaddq_u32(STATE0A, ABEF_SAVEA);
     643           0 :     STATE0B = vaddq_u32(STATE0B, ABEF_SAVEB);
     644           0 :     STATE1A = vaddq_u32(STATE1A, CDGH_SAVEA);
     645           0 :     STATE1B = vaddq_u32(STATE1B, CDGH_SAVEB);
     646             : 
     647             :     // Transform 3: Pad previous output
     648           0 :     MSG0A = STATE0A;
     649           0 :     MSG0B = STATE0B;
     650           0 :     MSG1A = STATE1A;
     651           0 :     MSG1B = STATE1B;
     652           0 :     MSG2A = vld1q_u32(&FINAL[0]);
     653           0 :     MSG2B = MSG2A;
     654           0 :     MSG3A = vld1q_u32(&FINAL[4]);
     655           0 :     MSG3B = MSG3A;
     656             : 
     657             :     // Transform 3: Load state
     658           0 :     STATE0A = vld1q_u32(&INIT[0]);
     659           0 :     STATE0B = STATE0A;
     660           0 :     STATE1A = vld1q_u32(&INIT[4]);
     661           0 :     STATE1B = STATE1A;
     662             : 
     663             :     // Transform 3: Rounds 1-4
     664           0 :     TMP = vld1q_u32(&K[0]);
     665           0 :     TMP0A = vaddq_u32(MSG0A, TMP);
     666           0 :     TMP0B = vaddq_u32(MSG0B, TMP);
     667           0 :     TMP2A = STATE0A;
     668           0 :     TMP2B = STATE0B;
     669           0 :     MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
     670           0 :     MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
     671           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     672           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     673           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     674           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     675           0 :     MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
     676           0 :     MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
     677             : 
     678             :     // Transform 3: Rounds 5-8
     679           0 :     TMP = vld1q_u32(&K[4]);
     680           0 :     TMP0A = vaddq_u32(MSG1A, TMP);
     681           0 :     TMP0B = vaddq_u32(MSG1B, TMP);
     682           0 :     TMP2A = STATE0A;
     683           0 :     TMP2B = STATE0B;
     684           0 :     MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
     685           0 :     MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
     686           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     687           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     688           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     689           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     690           0 :     MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
     691           0 :     MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
     692             : 
     693             :     // Transform 3: Rounds 9-12
     694           0 :     TMP = vld1q_u32(&FINS[0]);
     695           0 :     TMP2A = STATE0A;
     696           0 :     TMP2B = STATE0B;
     697           0 :     MSG2A = vld1q_u32(&FINS[4]);
     698           0 :     MSG2B = MSG2A;
     699           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
     700           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
     701           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
     702           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
     703           0 :     MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
     704           0 :     MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
     705             : 
     706             :     // Transform 3: Rounds 13-16
     707           0 :     TMP = vld1q_u32(&FINS[8]);
     708           0 :     TMP2A = STATE0A;
     709           0 :     TMP2B = STATE0B;
     710           0 :     MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
     711           0 :     MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
     712           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
     713           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
     714           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
     715           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
     716           0 :     MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
     717           0 :     MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
     718             : 
     719             :     // Transform 3: Rounds 17-20
     720           0 :     TMP = vld1q_u32(&K[16]);
     721           0 :     TMP0A = vaddq_u32(MSG0A, TMP);
     722           0 :     TMP0B = vaddq_u32(MSG0B, TMP);
     723           0 :     TMP2A = STATE0A;
     724           0 :     TMP2B = STATE0B;
     725           0 :     MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
     726           0 :     MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
     727           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     728           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     729           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     730           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     731           0 :     MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
     732           0 :     MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
     733             : 
     734             :     // Transform 3: Rounds 21-24
     735           0 :     TMP = vld1q_u32(&K[20]);
     736           0 :     TMP0A = vaddq_u32(MSG1A, TMP);
     737           0 :     TMP0B = vaddq_u32(MSG1B, TMP);
     738           0 :     TMP2A = STATE0A;
     739           0 :     TMP2B = STATE0B;
     740           0 :     MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
     741           0 :     MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
     742           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     743           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     744           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     745           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     746           0 :     MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
     747           0 :     MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
     748             : 
     749             :     // Transform 3: Rounds 25-28
     750           0 :     TMP = vld1q_u32(&K[24]);
     751           0 :     TMP0A = vaddq_u32(MSG2A, TMP);
     752           0 :     TMP0B = vaddq_u32(MSG2B, TMP);
     753           0 :     TMP2A = STATE0A;
     754           0 :     TMP2B = STATE0B;
     755           0 :     MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
     756           0 :     MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
     757           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     758           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     759           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     760           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     761           0 :     MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
     762           0 :     MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
     763             : 
     764             :     // Transform 3: Rounds 29-32
     765           0 :     TMP = vld1q_u32(&K[28]);
     766           0 :     TMP0A = vaddq_u32(MSG3A, TMP);
     767           0 :     TMP0B = vaddq_u32(MSG3B, TMP);
     768           0 :     TMP2A = STATE0A;
     769           0 :     TMP2B = STATE0B;
     770           0 :     MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
     771           0 :     MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
     772           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     773           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     774           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     775           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     776           0 :     MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
     777           0 :     MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
     778             : 
     779             :     // Transform 3: Rounds 33-36
     780           0 :     TMP = vld1q_u32(&K[32]);
     781           0 :     TMP0A = vaddq_u32(MSG0A, TMP);
     782           0 :     TMP0B = vaddq_u32(MSG0B, TMP);
     783           0 :     TMP2A = STATE0A;
     784           0 :     TMP2B = STATE0B;
     785           0 :     MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
     786           0 :     MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
     787           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     788           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     789           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     790           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     791           0 :     MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
     792           0 :     MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
     793             : 
     794             :     // Transform 3: Rounds 37-40
     795           0 :     TMP = vld1q_u32(&K[36]);
     796           0 :     TMP0A = vaddq_u32(MSG1A, TMP);
     797           0 :     TMP0B = vaddq_u32(MSG1B, TMP);
     798           0 :     TMP2A = STATE0A;
     799           0 :     TMP2B = STATE0B;
     800           0 :     MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
     801           0 :     MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
     802           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     803           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     804           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     805           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     806           0 :     MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
     807           0 :     MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
     808             : 
     809             :     // Transform 3: Rounds 41-44
     810           0 :     TMP = vld1q_u32(&K[40]);
     811           0 :     TMP0A = vaddq_u32(MSG2A, TMP);
     812           0 :     TMP0B = vaddq_u32(MSG2B, TMP);
     813           0 :     TMP2A = STATE0A;
     814           0 :     TMP2B = STATE0B;
     815           0 :     MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
     816           0 :     MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
     817           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     818           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     819           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     820           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     821           0 :     MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
     822           0 :     MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
     823             : 
     824             :     // Transform 3: Rounds 45-48
     825           0 :     TMP = vld1q_u32(&K[44]);
     826           0 :     TMP0A = vaddq_u32(MSG3A, TMP);
     827           0 :     TMP0B = vaddq_u32(MSG3B, TMP);
     828           0 :     TMP2A = STATE0A;
     829           0 :     TMP2B = STATE0B;
     830           0 :     MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
     831           0 :     MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
     832           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     833           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     834           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     835           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     836           0 :     MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
     837           0 :     MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
     838             : 
     839             :     // Transform 3: Rounds 49-52
     840           0 :     TMP = vld1q_u32(&K[48]);
     841           0 :     TMP0A = vaddq_u32(MSG0A, TMP);
     842           0 :     TMP0B = vaddq_u32(MSG0B, TMP);
     843           0 :     TMP2A = STATE0A;
     844           0 :     TMP2B = STATE0B;
     845           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     846           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     847           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     848           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     849             : 
     850             :     // Transform 3: Rounds 53-56
     851           0 :     TMP = vld1q_u32(&K[52]);
     852           0 :     TMP0A = vaddq_u32(MSG1A, TMP);
     853           0 :     TMP0B = vaddq_u32(MSG1B, TMP);
     854           0 :     TMP2A = STATE0A;
     855           0 :     TMP2B = STATE0B;
     856           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     857           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     858           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     859           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     860             : 
     861             :     // Transform 3: Rounds 57-60
     862           0 :     TMP = vld1q_u32(&K[56]);
     863           0 :     TMP0A = vaddq_u32(MSG2A, TMP);
     864           0 :     TMP0B = vaddq_u32(MSG2B, TMP);
     865           0 :     TMP2A = STATE0A;
     866           0 :     TMP2B = STATE0B;
     867           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     868           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     869           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     870           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     871             : 
     872             :     // Transform 3: Rounds 61-64
     873           0 :     TMP = vld1q_u32(&K[60]);
     874           0 :     TMP0A = vaddq_u32(MSG3A, TMP);
     875           0 :     TMP0B = vaddq_u32(MSG3B, TMP);
     876           0 :     TMP2A = STATE0A;
     877           0 :     TMP2B = STATE0B;
     878           0 :     STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
     879           0 :     STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
     880           0 :     STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
     881           0 :     STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
     882             : 
     883             :     // Transform 3: Update state
     884           0 :     TMP = vld1q_u32(&INIT[0]);
     885           0 :     STATE0A = vaddq_u32(STATE0A, TMP);
     886           0 :     STATE0B = vaddq_u32(STATE0B, TMP);
     887           0 :     TMP = vld1q_u32(&INIT[4]);
     888           0 :     STATE1A = vaddq_u32(STATE1A, TMP);
     889           0 :     STATE1B = vaddq_u32(STATE1B, TMP);
     890             : 
     891             :     // Store result
     892           0 :     vst1q_u8(output, vrev32q_u8(vreinterpretq_u8_u32(STATE0A)));
     893           0 :     vst1q_u8(output + 16, vrev32q_u8(vreinterpretq_u8_u32(STATE1A)));
     894           0 :     vst1q_u8(output + 32, vrev32q_u8(vreinterpretq_u8_u32(STATE0B)));
     895           0 :     vst1q_u8(output + 48, vrev32q_u8(vreinterpretq_u8_u32(STATE1B)));
     896           0 : }
     897             : }
     898             : 
     899             : #endif

Generated by: LCOV version 1.16