From 6eca7839af74c5ce2de62bc38cec2769bb9de360 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20D=C3=BCsterhus?= Date: Thu, 8 Aug 2024 22:19:33 +0200 Subject: [PATCH] hash: Add SHA-NI implementation of SHA-256 (#15152) * hash: Add SSE2 implementation of SHA-256 Implementation taken from tarsnap/libcperciva@661752aee82dcc8070754e3e539fdc7782bd3942. Co-authored-by: Christoph M. Becker Co-authored-by: Niels Dossche <7771979+nielsdos@users.noreply.github.com> * zend_cpuinfo: Add ZEND_CPU_FEATURE_SHA * hash: Add SHA-NI implementation of SHA-256 Implementation taken from tarsnap/libcperciva@661752aee82dcc8070754e3e539fdc7782bd3942. Co-authored-by: Christoph M. Becker * NEWS / UPGRADING --------- Co-authored-by: Christoph M. Becker Co-authored-by: Niels Dossche <7771979+nielsdos@users.noreply.github.com> --- NEWS | 2 + README.REDIST.BINS | 2 +- UPGRADING | 4 + Zend/zend_cpuinfo.h | 1 + ext/hash/config.m4 | 2 +- ext/hash/config.w32 | 2 +- ext/hash/hash_sha.c | 19 +++ ext/hash/hash_sha_ni.c | 176 +++++++++++++++++++++++++++ ext/hash/hash_sha_sse2.c | 257 +++++++++++++++++++++++++++++++++++++++ ext/hash/php_hash_sha.h | 21 ++++ 10 files changed, 483 insertions(+), 3 deletions(-) create mode 100644 ext/hash/hash_sha_ni.c create mode 100644 ext/hash/hash_sha_sse2.c diff --git a/NEWS b/NEWS index 15ee2b5c908..24ddfa6c20a 100644 --- a/NEWS +++ b/NEWS @@ -26,6 +26,8 @@ PHP NEWS - Hash: . Deprecated passing incorrect data types for options to ext/hash functions. (nielsdos) + . Added SSE2 and SHA-NI implementation of SHA-256. (timwolla, Colin Percival, + Graham Percival) - PHPDBG: . array out of bounds, stack overflow handled for segfault handler on windows. diff --git a/README.REDIST.BINS b/README.REDIST.BINS index 29d41b9d2d6..14a93d5e294 100644 --- a/README.REDIST.BINS +++ b/README.REDIST.BINS @@ -18,7 +18,7 @@ 18. avifinfo (ext/standard/libavifinfo) see ext/standard/libavifinfo/LICENSE 19. xxHash (ext/hash/xxhash) 20. Lexbor (ext/dom/lexbor/lexbor) see ext/dom/lexbor/LICENSE - +21. Portions of libcperciva (ext/hash/hash_sha_{ni,sse2}.c) see the header in the source file 3. pcre2lib (ext/pcre) diff --git a/UPGRADING b/UPGRADING index c35e889bab4..8044e34483d 100644 --- a/UPGRADING +++ b/UPGRADING @@ -955,6 +955,10 @@ PHP 8.4 UPGRADE NOTES . Improved the performance of FTP uploads up to a factor of 10x for large uploads. +- Hash: + . Added SSE2 and SHA-NI implementations of SHA-256. This improves the performance + on supported CPUs by ~1.3x (SSE2) and 3x - 5x (SHA-NI). + - MBString: . The performance of strspn() and strcspn() is greatly improved. They now run in linear time instead of being bounded by quadratic time. diff --git a/Zend/zend_cpuinfo.h b/Zend/zend_cpuinfo.h index 9537df67b24..c81865c4a4d 100644 --- a/Zend/zend_cpuinfo.h +++ b/Zend/zend_cpuinfo.h @@ -64,6 +64,7 @@ typedef enum _zend_cpu_feature { ZEND_CPU_FEATURE_AVX512F = (1<<16 | ZEND_CPU_EBX_MASK), ZEND_CPU_FEATURE_AVX512DQ = (1<<17 | ZEND_CPU_EBX_MASK), ZEND_CPU_FEATURE_AVX512CD = (1<<28 | ZEND_CPU_EBX_MASK), + ZEND_CPU_FEATURE_SHA = (1<<29 | ZEND_CPU_EBX_MASK), /* intentionally don't support = (1<<30 | ZEND_CPU_EBX_MASK) */ /* intentionally don't support = (1<<31 | ZEND_CPU_EBX_MASK) */ diff --git a/ext/hash/config.m4 b/ext/hash/config.m4 index f5608183504..ed4dbbf3aef 100644 --- a/ext/hash/config.m4 +++ b/ext/hash/config.m4 @@ -34,7 +34,7 @@ else PHP_HASH_CFLAGS="$PHP_HASH_CFLAGS -I@ext_srcdir@/$SHA3_DIR -DKeccakP200_excluded -DKeccakP400_excluded -DKeccakP800_excluded -DZEND_ENABLE_STATIC_TSRMLS_CACHE=1" fi -EXT_HASH_SOURCES="hash.c hash_md.c hash_sha.c hash_ripemd.c hash_haval.c \ +EXT_HASH_SOURCES="hash.c hash_md.c hash_sha.c hash_sha_sse2.c hash_sha_ni.c hash_ripemd.c hash_haval.c \ hash_tiger.c hash_gost.c hash_snefru.c hash_whirlpool.c hash_adler32.c \ hash_crc32.c hash_fnv.c hash_joaat.c $EXT_HASH_SHA3_SOURCES murmur/PMurHash.c murmur/PMurHash128.c hash_murmur.c hash_xxhash.c" diff --git a/ext/hash/config.w32 b/ext/hash/config.w32 index e574026ac0d..dd4c050bcd2 100644 --- a/ext/hash/config.w32 +++ b/ext/hash/config.w32 @@ -9,7 +9,7 @@ if (PHP_MHASH != 'no') { PHP_HASH = 'yes'; -EXTENSION('hash', 'hash.c hash_md.c hash_sha.c hash_ripemd.c hash_haval.c ' + +EXTENSION('hash', 'hash.c hash_md.c hash_sha.c hash_sha_sse2.c hash_sha_ni.c hash_ripemd.c hash_haval.c ' + 'hash_tiger.c hash_gost.c hash_snefru.c hash_whirlpool.c ' + 'hash_adler32.c hash_crc32.c hash_joaat.c hash_fnv.c ' + 'hash_sha3.c hash_murmur.c hash_xxhash.c', false); diff --git a/ext/hash/hash_sha.c b/ext/hash/hash_sha.c index 201a8513f96..3129446fcde 100644 --- a/ext/hash/hash_sha.c +++ b/ext/hash/hash_sha.c @@ -17,6 +17,7 @@ #include "php_hash.h" #include "php_hash_sha.h" +#include "Zend/zend_cpuinfo.h" static const unsigned char PADDING[128] = { @@ -160,6 +161,24 @@ PHP_HASH_API void PHP_SHA256InitArgs(PHP_SHA256_CTX * context, ZEND_ATTRIBUTE_UN */ static void SHA256Transform(uint32_t state[8], const unsigned char block[64]) { +#if defined(PHP_HASH_INTRIN_SHA_NATIVE) + SHA256_Transform_shani(state, block); + return; +#elif defined(PHP_HASH_INTRIN_SHA_RESOLVER) + if (zend_cpu_supports(ZEND_CPU_FEATURE_SSSE3) && zend_cpu_supports(ZEND_CPU_FEATURE_SHA)) { + SHA256_Transform_shani(state, block); + return; + } +#endif + +#if defined(__SSE2__) + uint32_t tmp32[72]; + + SHA256_Transform_sse2(state, block, &tmp32[0], &tmp32[64]); + ZEND_SECURE_ZERO((unsigned char*) tmp32, sizeof(tmp32)); + return; +#endif + uint32_t a = state[0], b = state[1], c = state[2], d = state[3]; uint32_t e = state[4], f = state[5], g = state[6], h = state[7]; uint32_t x[16], T1, T2, W[64]; diff --git a/ext/hash/hash_sha_ni.c b/ext/hash/hash_sha_ni.c new file mode 100644 index 00000000000..3c98786c73f --- /dev/null +++ b/ext/hash/hash_sha_ni.c @@ -0,0 +1,176 @@ +/*- + * Copyright 2018 Tarsnap Backup Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "php_hash.h" +#include "php_hash_sha.h" + +#if (defined(__i386__) || defined(__x86_64__)) && defined(HAVE_IMMINTRIN_H) + +# include + +# if PHP_HASH_INTRIN_SHA_RESOLVER +static __m128i be32dec_128(const uint8_t * src) __attribute__((target("ssse3"))); +void SHA256_Transform_shani(uint32_t state[PHP_STATIC_RESTRICT 8], const uint8_t block[PHP_STATIC_RESTRICT 64]) __attribute__((target("ssse3,sha"))); +# endif + +/* Original implementation from libcperciva follows. + * + * Modified to use `PHP_STATIC_RESTRICT` for MSVC compatibility. + */ + +/** + * This code uses intrinsics from the following feature sets: + * SHANI: _mm_sha256msg1_epu32, _mm_sha256msg2_epu32, _mm_sha256rnds2_epu32 + * SSSE3: _mm_shuffle_epi8, _mm_alignr_epi8 + * SSE2: Everything else + * + * The SSSE3 intrinsics could be avoided at a slight cost by using a few SSE2 + * instructions in their place; we have not done this since to our knowledge + * there are presently no CPUs which support the SHANI instruction set but do + * not support SSSE3. + */ + +/* Load 32-bit big-endian words. */ +static __m128i +be32dec_128(const uint8_t * src) +{ + const __m128i SHUF = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, + 4, 5, 6, 7, 0, 1, 2, 3); + __m128i x; + + /* Load four 32-bit words. */ + x = _mm_loadu_si128((const __m128i *)src); + + /* Reverse the order of the bytes in each word. */ + return (_mm_shuffle_epi8(x, SHUF)); +} + +/* Convert an unsigned 32-bit immediate into a signed value. */ +#define I32(a) ((UINT32_C(a) >= UINT32_C(0x80000000)) ? \ + -(int32_t)(UINT32_C(0xffffffff) - UINT32_C(a)) - 1 : (int32_t)INT32_C(a)) + +/* Load four unsigned 32-bit immediates into a vector register. */ +#define IMM4(a, b, c, d) _mm_set_epi32(I32(a), I32(b), I32(c), I32(d)) + +/* Run four rounds of SHA256. */ +#define RND4(S, W, K0, K1, K2, K3) do { \ + __m128i M; \ + \ + /* Add the next four words of message schedule and round constants. */ \ + M = _mm_add_epi32(W, IMM4(K3, K2, K1, K0)); \ + \ + /* Perform two rounds of SHA256, using the low two words in M. */ \ + S[1] = _mm_sha256rnds2_epu32(S[1], S[0], M); \ + \ + /* Shift the two words of M down and perform the next two rounds. */ \ + M = _mm_srli_si128(M, 8); \ + S[0] = _mm_sha256rnds2_epu32(S[0], S[1], M); \ +} while (0) + +/* Compute the ith set of four words of message schedule. */ +#define MSG4(W, i) do { \ + W[(i + 0) % 4] = _mm_sha256msg1_epu32(W[(i + 0) % 4], W[(i + 1) % 4]); \ + W[(i + 0) % 4] = _mm_add_epi32(W[(i + 0) % 4], \ + _mm_alignr_epi8(W[(i + 3) % 4], W[(i + 2) % 4], 4)); \ + W[(i + 0) % 4] = _mm_sha256msg2_epu32(W[(i + 0) % 4], W[(i + 3) % 4]); \ +} while (0) + +/* Perform 4 rounds of SHA256 and generate more message schedule if needed. */ +#define RNDMSG(S, W, i, K0, K1, K2, K3) do { \ + RND4(S, W[i % 4], K0, K1, K2, K3); \ + if (i < 12) \ + MSG4(W, i + 4); \ +} while (0) + +/** + * SHA256_Transform_shani(state, block): + * Compute the SHA256 block compression function, transforming ${state} using + * the data in ${block}. This implementation uses x86 SHANI and SSSE3 + * instructions, and should only be used if CPUSUPPORT_X86_SHANI and _SSSE3 + * are defined and cpusupport_x86_shani() and _ssse3() return nonzero. + */ +void +SHA256_Transform_shani(uint32_t state[PHP_STATIC_RESTRICT 8], + const uint8_t block[PHP_STATIC_RESTRICT 64]) +{ + __m128i S3210, S7654; + __m128i S0123, S4567; + __m128i S0145, S2367; + __m128i W[4]; + __m128i S[2]; + + /* Load state. */ + S3210 = _mm_loadu_si128((const __m128i *)&state[0]); + S7654 = _mm_loadu_si128((const __m128i *)&state[4]); + + /* Shuffle the 8 32-bit values into the order we need them. */ + S0123 = _mm_shuffle_epi32(S3210, 0x1B); + S4567 = _mm_shuffle_epi32(S7654, 0x1B); + S0145 = _mm_unpackhi_epi64(S4567, S0123); + S2367 = _mm_unpacklo_epi64(S4567, S0123); + + /* Load input block; this is the start of the message schedule. */ + W[0] = be32dec_128(&block[0]); + W[1] = be32dec_128(&block[16]); + W[2] = be32dec_128(&block[32]); + W[3] = be32dec_128(&block[48]); + + /* Initialize working variables. */ + S[0] = S0145; + S[1] = S2367; + + /* Perform 64 rounds, 4 at a time. */ + RNDMSG(S, W, 0, 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5); + RNDMSG(S, W, 1, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5); + RNDMSG(S, W, 2, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3); + RNDMSG(S, W, 3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174); + RNDMSG(S, W, 4, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc); + RNDMSG(S, W, 5, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da); + RNDMSG(S, W, 6, 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7); + RNDMSG(S, W, 7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967); + RNDMSG(S, W, 8, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13); + RNDMSG(S, W, 9, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85); + RNDMSG(S, W, 10, 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3); + RNDMSG(S, W, 11, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070); + RNDMSG(S, W, 12, 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5); + RNDMSG(S, W, 13, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3); + RNDMSG(S, W, 14, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208); + RNDMSG(S, W, 15, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2); + + /* Mix local working variables into global state. */ + S0145 = _mm_add_epi32(S0145, S[0]); + S2367 = _mm_add_epi32(S2367, S[1]); + + /* Shuffle state back to the original word order and store. */ + S0123 = _mm_unpackhi_epi64(S2367, S0145); + S4567 = _mm_unpacklo_epi64(S2367, S0145); + S3210 = _mm_shuffle_epi32(S0123, 0x1B); + S7654 = _mm_shuffle_epi32(S4567, 0x1B); + _mm_storeu_si128((__m128i *)&state[0], S3210); + _mm_storeu_si128((__m128i *)&state[4], S7654); +} + +#endif diff --git a/ext/hash/hash_sha_sse2.c b/ext/hash/hash_sha_sse2.c new file mode 100644 index 00000000000..7185a587b15 --- /dev/null +++ b/ext/hash/hash_sha_sse2.c @@ -0,0 +1,257 @@ +/*- + * Copyright 2021 Tarsnap Backup Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "php_hash.h" +#include "php_hash_sha.h" + +#ifdef __SSE2__ +# include + +/* Original implementation from libcperciva follows. + * + * Modified to use `PHP_STATIC_RESTRICT` for MSVC compatibility. + */ + +/** + * mm_bswap_epi32(a): + * Byte-swap each 32-bit word. + */ +static inline __m128i +mm_bswap_epi32(__m128i a) +{ + + /* Swap bytes in each 16-bit word. */ + a = _mm_or_si128(_mm_slli_epi16(a, 8), _mm_srli_epi16(a, 8)); + + /* Swap all 16-bit words. */ + a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 3, 0, 1)); + a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(2, 3, 0, 1)); + + return (a); +} + +/* SHA256 round constants. */ +static const uint32_t Krnd[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + h += S1(e) + Ch(e, f, g) + k; \ + d += h; \ + h += S0(a) + Maj(a, b, c) + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i, ii) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i + ii] + Krnd[i + ii]) + +/* Message schedule computation */ +#define SHR32(x, n) (_mm_srli_epi32(x, n)) +#define ROTR32(x, n) (_mm_or_si128(SHR32(x, n), _mm_slli_epi32(x, (32-n)))) +#define s0_128(x) _mm_xor_si128(_mm_xor_si128( \ + ROTR32(x, 7), ROTR32(x, 18)), SHR32(x, 3)) + +static inline __m128i +s1_128_high(__m128i a) +{ + __m128i b; + __m128i c; + + /* ROTR, loading data as {B, B, A, A}; lanes 1 & 3 will be junk. */ + b = _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 0, 0)); + c = _mm_xor_si128(_mm_srli_epi64(b, 17), _mm_srli_epi64(b, 19)); + + /* Shift and XOR with rotated data; lanes 1 & 3 will be junk. */ + c = _mm_xor_si128(c, _mm_srli_epi32(b, 10)); + + /* Shuffle good data back and zero unwanted lanes. */ + c = _mm_shuffle_epi32(c, _MM_SHUFFLE(2, 0, 2, 0)); + c = _mm_slli_si128(c, 8); + + return (c); +} + +static inline __m128i +s1_128_low(__m128i a) +{ + __m128i b; + __m128i c; + + /* ROTR, loading data as {B, B, A, A}; lanes 1 & 3 will be junk. */ + b = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 2, 2)); + c = _mm_xor_si128(_mm_srli_epi64(b, 17), _mm_srli_epi64(b, 19)); + + /* Shift and XOR with rotated data; lanes 1 & 3 will be junk. */ + c = _mm_xor_si128(c, _mm_srli_epi32(b, 10)); + + /* Shuffle good data back and zero unwanted lanes. */ + c = _mm_shuffle_epi32(c, _MM_SHUFFLE(2, 0, 2, 0)); + c = _mm_srli_si128(c, 8); + + return (c); +} + +/** + * SPAN_ONE_THREE(a, b): + * Combine the upper three words of ${a} with the lowest word of ${b}. This + * could also be thought of returning bits [159:32] of the 256-bit value + * consisting of (b[127:0] a[127:0]). In other words, set: + * dst[31:0] := a[63:32] + * dst[63:32] := a[95:64] + * dst[95:64] := a[127:96] + * dst[127:96] := b[31:0] + */ +#define SPAN_ONE_THREE(a, b) (_mm_shuffle_epi32(_mm_castps_si128( \ + _mm_move_ss(_mm_castsi128_ps(a), _mm_castsi128_ps(b))), \ + _MM_SHUFFLE(0, 3, 2, 1))) + +/** + * MSG4(X0, X1, X2, X3): + * Calculate the next four values of the message schedule. If we define + * ${W[j]} as the first unknown value in the message schedule, then the input + * arguments are: + * X0 = W[j - 16] : W[j - 13] + * X1 = W[j - 12] : W[j - 9] + * X2 = W[j - 8] : W[j - 5] + * X3 = W[j - 4] : W[j - 1] + * This function therefore calculates: + * X4 = W[j + 0] : W[j + 3] + */ +static inline __m128i +MSG4(__m128i X0, __m128i X1, __m128i X2, __m128i X3) +{ + __m128i X4; + __m128i Xj_minus_seven, Xj_minus_fifteen; + + /* Set up variables which span X values. */ + Xj_minus_seven = SPAN_ONE_THREE(X2, X3); + Xj_minus_fifteen = SPAN_ONE_THREE(X0, X1); + + /* Begin computing X4. */ + X4 = _mm_add_epi32(X0, Xj_minus_seven); + X4 = _mm_add_epi32(X4, s0_128(Xj_minus_fifteen)); + + /* First half of s1. */ + X4 = _mm_add_epi32(X4, s1_128_low(X3)); + + /* Second half of s1; this depends on the above value of X4. */ + X4 = _mm_add_epi32(X4, s1_128_high(X4)); + + return (X4); +} + +/** + * SHA256_Transform_sse2(state, block, W, S): + * Compute the SHA256 block compression function, transforming ${state} using + * the data in ${block}. This implementation uses x86 SSE2 instructions, and + * should only be used if _SSE2 is defined and cpusupport_x86_sse2() returns + * nonzero. The arrays W and S may be filled with sensitive data, and should + * be cleared by the callee. + */ +void +SHA256_Transform_sse2(uint32_t state[PHP_STATIC_RESTRICT 8], + const uint8_t block[PHP_STATIC_RESTRICT 64], uint32_t W[PHP_STATIC_RESTRICT 64], + uint32_t S[PHP_STATIC_RESTRICT 8]) +{ + __m128i Y[4]; + int i; + + /* 1. Prepare the first part of the message schedule W. */ + Y[0] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[0])); + _mm_storeu_si128((__m128i *)&W[0], Y[0]); + Y[1] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[16])); + _mm_storeu_si128((__m128i *)&W[4], Y[1]); + Y[2] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[32])); + _mm_storeu_si128((__m128i *)&W[8], Y[2]); + Y[3] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[48])); + _mm_storeu_si128((__m128i *)&W[12], Y[3]); + + /* 2. Initialize working variables. */ + memcpy(S, state, 32); + + /* 3. Mix. */ + for (i = 0; i < 64; i += 16) { + RNDr(S, W, 0, i); + RNDr(S, W, 1, i); + RNDr(S, W, 2, i); + RNDr(S, W, 3, i); + RNDr(S, W, 4, i); + RNDr(S, W, 5, i); + RNDr(S, W, 6, i); + RNDr(S, W, 7, i); + RNDr(S, W, 8, i); + RNDr(S, W, 9, i); + RNDr(S, W, 10, i); + RNDr(S, W, 11, i); + RNDr(S, W, 12, i); + RNDr(S, W, 13, i); + RNDr(S, W, 14, i); + RNDr(S, W, 15, i); + + if (i == 48) + break; + Y[0] = MSG4(Y[0], Y[1], Y[2], Y[3]); + _mm_storeu_si128((__m128i *)&W[16 + i + 0], Y[0]); + Y[1] = MSG4(Y[1], Y[2], Y[3], Y[0]); + _mm_storeu_si128((__m128i *)&W[16 + i + 4], Y[1]); + Y[2] = MSG4(Y[2], Y[3], Y[0], Y[1]); + _mm_storeu_si128((__m128i *)&W[16 + i + 8], Y[2]); + Y[3] = MSG4(Y[3], Y[0], Y[1], Y[2]); + _mm_storeu_si128((__m128i *)&W[16 + i + 12], Y[3]); + } + + /* 4. Mix local working variables into global state. */ + for (i = 0; i < 8; i++) + state[i] += S[i]; +} + +#endif diff --git a/ext/hash/php_hash_sha.h b/ext/hash/php_hash_sha.h index a7d71f73fe9..d5282026b96 100644 --- a/ext/hash/php_hash_sha.h +++ b/ext/hash/php_hash_sha.h @@ -45,6 +45,27 @@ typedef struct { #define PHP_SHA256Init(ctx) PHP_SHA256InitArgs(ctx, NULL) PHP_HASH_API void PHP_SHA256InitArgs(PHP_SHA256_CTX *, ZEND_ATTRIBUTE_UNUSED HashTable *); PHP_HASH_API void PHP_SHA256Update(PHP_SHA256_CTX *, const unsigned char *, size_t); + +#ifdef _MSC_VER +# define PHP_STATIC_RESTRICT +#else +# define PHP_STATIC_RESTRICT static restrict +#endif + +#if defined(__SSE2__) +void SHA256_Transform_sse2(uint32_t state[PHP_STATIC_RESTRICT 8], const uint8_t block[PHP_STATIC_RESTRICT 64], uint32_t W[PHP_STATIC_RESTRICT 64], uint32_t S[PHP_STATIC_RESTRICT 8]); +#endif + +#if (defined(__i386__) || defined(__x86_64__)) && defined(HAVE_IMMINTRIN_H) +# if defined(__SSSE3__) && defined(__SHA__) +# define PHP_HASH_INTRIN_SHA_NATIVE 1 +# elif defined(HAVE_FUNC_ATTRIBUTE_TARGET) +# define PHP_HASH_INTRIN_SHA_RESOLVER 1 +# endif + +void SHA256_Transform_shani(uint32_t state[PHP_STATIC_RESTRICT 8], const uint8_t block[PHP_STATIC_RESTRICT 64]); +#endif + PHP_HASH_API void PHP_SHA256Final(unsigned char[32], PHP_SHA256_CTX *); /* SHA384 context */