X-Git-Url: http://www.wagner.pp.ru/gitweb/?p=openssl-gost%2Fengine.git;a=blobdiff_plain;f=gosthash2012_sse2.h;h=07e2ec0079caf925417918d879b4a0d36f6b67e7;hp=6401beb9584a1ff2b6e51afd84c12bc807f7a3f3;hb=HEAD;hpb=57d07eb0dc22bee10aebb0bd37cbdf2258413564

diff --git a/gosthash2012_sse2.h b/gosthash2012_sse2.h
index 6401beb..07e2ec0 100644
--- a/gosthash2012_sse2.h
+++ b/gosthash2012_sse2.h
@@ -14,6 +14,9 @@
 #include <mmintrin.h>
 #include <emmintrin.h>
 
+#ifdef __SSE3__
+# include <pmmintrin.h>
+#endif
 #define LO(v) ((unsigned char) (v))
 #define HI(v) ((unsigned char) (((unsigned int) (v)) >> 8))
 
@@ -29,20 +32,50 @@
 # define _mm_cvtm64_si64(v) (long long) v
 #endif
 
+#ifdef __SSE3__
+/*
+ * "This intrinsic may perform better than _mm_loadu_si128 when
+ * the data crosses a cache line boundary."
+ */
+# define UMEM_READ_I128 _mm_lddqu_si128
+#else /* SSE2 */
+# define UMEM_READ_I128 _mm_loadu_si128
+#endif
+
+/* load 512bit from unaligned memory */
+#define ULOAD(P, xmm0, xmm1, xmm2, xmm3) { \
+    const __m128i *__m128p = (const __m128i *) P; \
+    xmm0 = UMEM_READ_I128(&__m128p[0]); \
+    xmm1 = UMEM_READ_I128(&__m128p[1]); \
+    xmm2 = UMEM_READ_I128(&__m128p[2]); \
+    xmm3 = UMEM_READ_I128(&__m128p[3]); \
+}
+
+#ifdef UNALIGNED_SIMD_ACCESS
+
+# define MEM_WRITE_I128 _mm_storeu_si128
+# define MEM_READ_I128 UMEM_READ_I128
+# define LOAD ULOAD
+
+#else /* !UNALIGNED_SIMD_ACCESS */
+
+# define MEM_WRITE_I128 _mm_store_si128
+# define MEM_READ_I128 _mm_load_si128
 #define LOAD(P, xmm0, xmm1, xmm2, xmm3) { \
-    const __m128i *__m128p = (const __m128i *) &P[0]; \
-    xmm0 = _mm_load_si128(&__m128p[0]); \
-    xmm1 = _mm_load_si128(&__m128p[1]); \
-    xmm2 = _mm_load_si128(&__m128p[2]); \
-    xmm3 = _mm_load_si128(&__m128p[3]); \
+    const __m128i *__m128p = (const __m128i *) P; \
+    xmm0 = MEM_READ_I128(&__m128p[0]); \
+    xmm1 = MEM_READ_I128(&__m128p[1]); \
+    xmm2 = MEM_READ_I128(&__m128p[2]); \
+    xmm3 = MEM_READ_I128(&__m128p[3]); \
 }
+#endif /* !UNALIGNED_SIMD_ACCESS */
 
-#define UNLOAD(P, xmm0, xmm1, xmm2, xmm3) { \
+#define STORE(P, xmm0, xmm1, xmm2, xmm3) { \
     __m128i *__m128p = (__m128i *) &P[0]; \
-    _mm_store_si128(&__m128p[0], xmm0); \
-    _mm_store_si128(&__m128p[1], xmm1); \
-    _mm_store_si128(&__m128p[2], xmm2); \
-    _mm_store_si128(&__m128p[3], xmm3); \
+    MEM_WRITE_I128(&__m128p[0], xmm0); \
+    MEM_WRITE_I128(&__m128p[1], xmm1); \
+    MEM_WRITE_I128(&__m128p[2], xmm2); \
+    MEM_WRITE_I128(&__m128p[3], xmm3); \
 }
 
 #define X128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) { \
@@ -54,10 +87,10 @@
 
 #define X128M(P, xmm0, xmm1, xmm2, xmm3) { \
     const __m128i *__m128p = (const __m128i *) &P[0]; \
-    xmm0 = _mm_xor_si128(xmm0, _mm_load_si128(&__m128p[0])); \
-    xmm1 = _mm_xor_si128(xmm1, _mm_load_si128(&__m128p[1])); \
-    xmm2 = _mm_xor_si128(xmm2, _mm_load_si128(&__m128p[2])); \
-    xmm3 = _mm_xor_si128(xmm3, _mm_load_si128(&__m128p[3])); \
+    xmm0 = _mm_xor_si128(xmm0, MEM_READ_I128(&__m128p[0])); \
+    xmm1 = _mm_xor_si128(xmm1, MEM_READ_I128(&__m128p[1])); \
+    xmm2 = _mm_xor_si128(xmm2, MEM_READ_I128(&__m128p[2])); \
+    xmm3 = _mm_xor_si128(xmm3, MEM_READ_I128(&__m128p[3])); \
 }
 
 #define _mm_xor_64(mm0, mm1) _mm_xor_si64(mm0, _mm_cvtsi64_m64(mm1))
@@ -101,31 +134,6 @@
     xmm4 = _mm_set_epi64(mm1, mm0); \
 }
 
-#define __EXTRACT64(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \
-    __m128i tmm4; \
-    register unsigned long long r0, r1; \
-    r0 = Ax[0][_mm_extract_epi8(xmm0, row + 0)]; \
-    r0 ^= Ax[1][_mm_extract_epi8(xmm0, row + 8)]; \
-    r0 ^= Ax[2][_mm_extract_epi8(xmm1, row + 0)]; \
-    r0 ^= Ax[3][_mm_extract_epi8(xmm1, row + 8)]; \
-    r0 ^= Ax[4][_mm_extract_epi8(xmm2, row + 0)]; \
-    r0 ^= Ax[5][_mm_extract_epi8(xmm2, row + 8)]; \
-    r0 ^= Ax[6][_mm_extract_epi8(xmm3, row + 0)]; \
-    r0 ^= Ax[7][_mm_extract_epi8(xmm3, row + 8)]; \
-    \
-    r1 = Ax[0][_mm_extract_epi8(xmm0, row + 1)]; \
-    r1 ^= Ax[1][_mm_extract_epi8(xmm0, row + 9)]; \
-    r1 ^= Ax[2][_mm_extract_epi8(xmm1, row + 1)]; \
-    r1 ^= Ax[3][_mm_extract_epi8(xmm1, row + 9)]; \
-    r1 ^= Ax[4][_mm_extract_epi8(xmm2, row + 1)]; \
-    r1 ^= Ax[5][_mm_extract_epi8(xmm2, row + 9)]; \
-    r1 ^= Ax[6][_mm_extract_epi8(xmm3, row + 1)]; \
-    r1 ^= Ax[7][_mm_extract_epi8(xmm3, row + 9)]; \
-    xmm4 = _mm_cvtsi64_si128((long long) r0); \
-    tmm4 = _mm_cvtsi64_si128((long long) r1); \
-    xmm4 = _mm_unpacklo_epi64(xmm4, tmm4); \
-}
-
 #define EXTRACT64(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \
     __m128i tmm4; \
     register unsigned short ax; \
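
Note on the load-dispatch pattern introduced above: UMEM_READ_I128 resolves to _mm_lddqu_si128 when SSE3 is available (per the quoted Intel note, it may be faster than _mm_loadu_si128 when a load crosses a cache-line boundary) and to _mm_loadu_si128 otherwise; ULOAD then issues four such 128-bit loads to read one 512-bit block, and UNALIGNED_SIMD_ACCESS decides whether LOAD may touch unaligned memory at all. The standalone sketch below (not part of the patch; compile with, e.g., gcc -msse3 -O2) demonstrates that dispatch on a deliberately misaligned buffer. The ULOAD macro is copied from the patch; the buffer, the XOR fold, and main() are illustrative only.

/*
 * Standalone sketch, not part of the patch: the UMEM_READ_I128 / ULOAD
 * pattern applied to a misaligned 64-byte block.
 */
#include <stdio.h>
#include <emmintrin.h>
#ifdef __SSE3__
# include <pmmintrin.h>
# define UMEM_READ_I128 _mm_lddqu_si128 /* SSE3: tolerant of cache-line splits */
#else
# define UMEM_READ_I128 _mm_loadu_si128 /* SSE2 fallback */
#endif

/* load 512bit from unaligned memory (verbatim from the patch) */
#define ULOAD(P, xmm0, xmm1, xmm2, xmm3) { \
    const __m128i *__m128p = (const __m128i *) P; \
    xmm0 = UMEM_READ_I128(&__m128p[0]); \
    xmm1 = UMEM_READ_I128(&__m128p[1]); \
    xmm2 = UMEM_READ_I128(&__m128p[2]); \
    xmm3 = UMEM_READ_I128(&__m128p[3]); \
}

int main(void)
{
    unsigned char buf[65];
    const unsigned char *p = buf + 1;   /* deliberately misaligned */
    __m128i xmm0, xmm1, xmm2, xmm3;
    int i;

    for (i = 0; i < 65; i++)
        buf[i] = (unsigned char) i;

    /* _mm_load_si128 on p could fault; the unaligned loads cannot */
    ULOAD(p, xmm0, xmm1, xmm2, xmm3);

    /* XOR-fold the four 128-bit lanes into one */
    xmm0 = _mm_xor_si128(_mm_xor_si128(xmm0, xmm1),
                         _mm_xor_si128(xmm2, xmm3));

    printf("low 32 bits of fold: 0x%08x\n",
           (unsigned int) _mm_cvtsi128_si32(xmm0));
    return 0;
}

Note that ULOAD takes P unparenthesized, so callers should pass a plain pointer variable (as the hash code does), not an expression like buf + 1, which the cast would bind to incorrectly.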