From 211489fa492c33247383f71ce269858f512874ec Mon Sep 17 00:00:00 2001
From: dmitry dulesov
Date: Sat, 1 Feb 2020 02:59:07 +0300
Subject: [PATCH] gosthash2012: Improve SIMD implementation

Allow aligned/unaligned access. Use better intrinsics and other small
optimizations.

Committed-by: Vitaly Chikunov
---
 gosthash2012.c      | 10 +++-----
 gosthash2012.h      | 11 ++++++++
 gosthash2012_sse2.h | 61 ++++++++++++++++++++++++++++++++++-----------
 3 files changed, 62 insertions(+), 20 deletions(-)

diff --git a/gosthash2012.c b/gosthash2012.c
index d6cde21..6399a9e 100644
--- a/gosthash2012.c
+++ b/gosthash2012.c
@@ -118,7 +118,7 @@ static void g(union uint512_u *h, const union uint512_u * RESTRICT N,
     LOAD(N, xmm0, xmm2, xmm4, xmm6);
     XLPS128M(h, xmm0, xmm2, xmm4, xmm6);
 
-    LOAD(m, xmm1, xmm3, xmm5, xmm7);
+    ULOAD(m, xmm1, xmm3, xmm5, xmm7);
     XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
 
     for (i = 0; i < 11; i++)
@@ -128,12 +128,10 @@ static void g(union uint512_u *h, const union uint512_u * RESTRICT N,
     X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
 
     X128M(h, xmm0, xmm2, xmm4, xmm6);
-    X128M(m, xmm0, xmm2, xmm4, xmm6);
-
-    UNLOAD(h, xmm0, xmm2, xmm4, xmm6);
+    ULOAD(m, xmm1, xmm3, xmm5, xmm7);
+    X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
 
-    /* Restore the Floating-point status on the CPU */
-    _mm_empty();
+    STORE(h, xmm0, xmm2, xmm4, xmm6);
 #else
     union uint512_u Ki, data;
     unsigned int i;
diff --git a/gosthash2012.h b/gosthash2012.h
index 5b1cd9f..8947ee6 100644
--- a/gosthash2012.h
+++ b/gosthash2012.h
@@ -12,6 +12,17 @@
 
 #ifdef __SSE2__
 # define __GOST3411_HAS_SSE2__
+# if !defined(__x86_64__)
+/*
+ * The x86-64 Linux and Windows ABIs guarantee that malloc returns buffers
+ * with the 16-byte alignment required by aligned SSE load/store
+ * instructions. Other platforms would need a special trick to allocate the
+ * gost2012_hash_ctx structure with proper alignment, so there it is easier
+ * to switch to the unaligned loadu/storeu memory access instructions.
+ */
+#  define UNALIGNED_SIMD_ACCESS
+#  pragma message "Use unaligned SIMD memory access"
+# endif
 #endif
 
 #ifdef __GOST3411_HAS_SSE2__
diff --git a/gosthash2012_sse2.h b/gosthash2012_sse2.h
index f45dab1..188f7b8 100644
--- a/gosthash2012_sse2.h
+++ b/gosthash2012_sse2.h
@@ -16,6 +16,9 @@
 
 #include <mmintrin.h>
 #include <emmintrin.h>
+#ifdef __SSE3__
+# include <pmmintrin.h>
+#endif
 
 #define LO(v) ((unsigned char) (v))
 #define HI(v) ((unsigned char) (((unsigned int) (v)) >> 8))
@@ -31,20 +34,50 @@
 # define _mm_cvtm64_si64(v) (long long) v
 #endif
 
+#ifdef __SSE3__
+/*
+ * "This intrinsic may perform better than _mm_loadu_si128 when
+ * the data crosses a cache line boundary."
+ */
+# define UMEM_READ_I128 _mm_lddqu_si128
+#else /* SSE2 */
+# define UMEM_READ_I128 _mm_loadu_si128
+#endif
+
+/* Load 512 bits from unaligned memory */
+#define ULOAD(P, xmm0, xmm1, xmm2, xmm3) { \
+    const __m128i *__m128p = (const __m128i *) P; \
+    xmm0 = UMEM_READ_I128(&__m128p[0]); \
+    xmm1 = UMEM_READ_I128(&__m128p[1]); \
+    xmm2 = UMEM_READ_I128(&__m128p[2]); \
+    xmm3 = UMEM_READ_I128(&__m128p[3]); \
+}
+
+#ifdef UNALIGNED_SIMD_ACCESS
+
+# define MEM_WRITE_I128 _mm_storeu_si128
+# define MEM_READ_I128 UMEM_READ_I128
+# define LOAD ULOAD
+
+#else /* !UNALIGNED_SIMD_ACCESS */
+
+# define MEM_WRITE_I128 _mm_store_si128
+# define MEM_READ_I128 _mm_load_si128
 #define LOAD(P, xmm0, xmm1, xmm2, xmm3) { \
-    const __m128i *__m128p = (const __m128i *) &P[0]; \
-    xmm0 = _mm_load_si128(&__m128p[0]); \
-    xmm1 = _mm_load_si128(&__m128p[1]); \
-    xmm2 = _mm_load_si128(&__m128p[2]); \
-    xmm3 = _mm_load_si128(&__m128p[3]); \
+    const __m128i *__m128p = (const __m128i *) P; \
+    xmm0 = MEM_READ_I128(&__m128p[0]); \
+    xmm1 = MEM_READ_I128(&__m128p[1]); \
+    xmm2 = MEM_READ_I128(&__m128p[2]); \
+    xmm3 = MEM_READ_I128(&__m128p[3]); \
 }
+#endif /* !UNALIGNED_SIMD_ACCESS */
 
-#define UNLOAD(P, xmm0, xmm1, xmm2, xmm3) { \
+#define STORE(P, xmm0, xmm1, xmm2, xmm3) { \
     __m128i *__m128p = (__m128i *) &P[0]; \
-    _mm_store_si128(&__m128p[0], xmm0); \
-    _mm_store_si128(&__m128p[1], xmm1); \
-    _mm_store_si128(&__m128p[2], xmm2); \
-    _mm_store_si128(&__m128p[3], xmm3); \
+    MEM_WRITE_I128(&__m128p[0], xmm0); \
+    MEM_WRITE_I128(&__m128p[1], xmm1); \
+    MEM_WRITE_I128(&__m128p[2], xmm2); \
+    MEM_WRITE_I128(&__m128p[3], xmm3); \
 }
 
 #define X128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) { \
@@ -56,10 +89,10 @@
 
 #define X128M(P, xmm0, xmm1, xmm2, xmm3) { \
     const __m128i *__m128p = (const __m128i *) &P[0]; \
-    xmm0 = _mm_xor_si128(xmm0, _mm_load_si128(&__m128p[0])); \
-    xmm1 = _mm_xor_si128(xmm1, _mm_load_si128(&__m128p[1])); \
-    xmm2 = _mm_xor_si128(xmm2, _mm_load_si128(&__m128p[2])); \
-    xmm3 = _mm_xor_si128(xmm3, _mm_load_si128(&__m128p[3])); \
+    xmm0 = _mm_xor_si128(xmm0, MEM_READ_I128(&__m128p[0])); \
+    xmm1 = _mm_xor_si128(xmm1, MEM_READ_I128(&__m128p[1])); \
+    xmm2 = _mm_xor_si128(xmm2, MEM_READ_I128(&__m128p[2])); \
+    xmm3 = _mm_xor_si128(xmm3, MEM_READ_I128(&__m128p[3])); \
 }
 
 #define _mm_xor_64(mm0, mm1) _mm_xor_si64(mm0, _mm_cvtsi64_m64(mm1))
-- 
2.39.2
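
A minimal standalone sketch (not part of the patch) of the aligned/unaligned
split introduced above: _mm_load_si128/_mm_store_si128 require 16-byte
aligned pointers, while _mm_loadu_si128 (or SSE3's _mm_lddqu_si128, which may
be faster when a load crosses a cache line) accept any address. The helper
name xor512_unaligned_src and the buffers below are made up for illustration,
and the GCC/Clang aligned(16) attribute is assumed; the flow mirrors how g()
now mixes the possibly unaligned message block m into the aligned state h.

#include <stdio.h>
#include <string.h>
#include <emmintrin.h>           /* SSE2: _mm_load_si128, _mm_loadu_si128 */
#ifdef __SSE3__
# include <pmmintrin.h>          /* SSE3: _mm_lddqu_si128 */
# define UMEM_READ_I128 _mm_lddqu_si128
#else
# define UMEM_READ_I128 _mm_loadu_si128
#endif

/*
 * XOR a 64-byte block into dst. dst must be 16-byte aligned (aligned load
 * and store); src may be unaligned (unaligned load).
 */
static void xor512_unaligned_src(void *dst, const void *src)
{
    __m128i *d = (__m128i *) dst;
    const __m128i *s = (const __m128i *) src;
    int i;

    for (i = 0; i < 4; i++)
        _mm_store_si128(&d[i], _mm_xor_si128(_mm_load_si128(&d[i]),
                                             UMEM_READ_I128(&s[i])));
}

int main(void)
{
    /* 16-byte aligned state; the source pointer is deliberately odd. */
    unsigned char state[64] __attribute__((aligned(16))) = { 0 };
    unsigned char buf[65];

    memset(buf, 0x5a, sizeof(buf));
    xor512_unaligned_src(state, buf + 1);     /* unaligned source path */
    printf("state[0] = 0x%02x\n", state[0]);  /* prints 0x5a */
    return 0;
}

Built with, e.g., gcc -msse2 (SSE2 is implied on x86-64); compiling with
-msse3 defines __SSE3__ and switches the unaligned reads to _mm_lddqu_si128.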