/*
 * Implementation of core functions for GOST R 34.11-2012 using SSE2.
 *
 * Copyright (c) 2013 Cryptocom LTD.
 * This file is distributed under the same license as OpenSSL.
 *
 * Author: Alexey Degtyarev <alexey@renatasystems.org>
 *
 */

#ifndef __GOST3411_HAS_SSE2__
# error "GOST R 34.11-2012: SSE2 not enabled"
#endif

#include <mmintrin.h>
#include <emmintrin.h>
#ifdef __SSE3__
# include <pmmintrin.h>
#endif

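/*
 * LO()/HI() pick the low and high byte of a 16-bit lane obtained with
 * _mm_extract_epi16(); each byte then indexes the precomputed Ax tables.
 */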
#define LO(v) ((unsigned char) (v))
#define HI(v) ((unsigned char) (((unsigned int) (v)) >> 8))

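/*
 * On 32-bit x86 the 64-bit table lookups are accumulated in MMX registers
 * (EXTRACT32); on 64-bit targets plain 64-bit integer XORs are used
 * (EXTRACT64).
 */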
#ifdef __i386__
# define EXTRACT EXTRACT32
#else
# define EXTRACT EXTRACT64
#endif

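/*
 * Compilers other than ICC may not provide these MMX conversion
 * intrinsics; plain casts are sufficient here.
 */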
#ifndef __ICC
# define _mm_cvtsi64_m64(v) (__m64) v
# define _mm_cvtm64_si64(v) (long long) v
#endif

#ifdef __SSE3__
/*
 * "This intrinsic may perform better than _mm_loadu_si128 when
 * the data crosses a cache line boundary."
 */
# define UMEM_READ_I128 _mm_lddqu_si128
#else /* SSE2 */
# define UMEM_READ_I128 _mm_loadu_si128
#endif

/* Load 512 bits (four __m128i words) from unaligned memory. */
#define ULOAD(P, xmm0, xmm1, xmm2, xmm3) { \
    const __m128i *__m128p = (const __m128i *) P; \
    xmm0 = UMEM_READ_I128(&__m128p[0]); \
    xmm1 = UMEM_READ_I128(&__m128p[1]); \
    xmm2 = UMEM_READ_I128(&__m128p[2]); \
    xmm3 = UMEM_READ_I128(&__m128p[3]); \
}

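/*
 * With UNALIGNED_SIMD_ACCESS defined, every load/store goes through the
 * unaligned intrinsics; otherwise the data behind LOAD/STORE/X128M is
 * assumed to be 16-byte aligned and the faster aligned forms are used.
 */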
#ifdef UNALIGNED_SIMD_ACCESS

# define MEM_WRITE_I128  _mm_storeu_si128
# define MEM_READ_I128   UMEM_READ_I128
# define LOAD            ULOAD

#else /* !UNALIGNED_SIMD_ACCESS */

# define MEM_WRITE_I128   _mm_store_si128
# define MEM_READ_I128   _mm_load_si128
#define LOAD(P, xmm0, xmm1, xmm2, xmm3) { \
    const __m128i *__m128p = (const __m128i *) P; \
    xmm0 = MEM_READ_I128(&__m128p[0]); \
    xmm1 = MEM_READ_I128(&__m128p[1]); \
    xmm2 = MEM_READ_I128(&__m128p[2]); \
    xmm3 = MEM_READ_I128(&__m128p[3]); \
}
#endif /* !UNALIGNED_SIMD_ACCESS */

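/* Write four XMM registers back to memory as one 512-bit block. */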
#define STORE(P, xmm0, xmm1, xmm2, xmm3) { \
    __m128i *__m128p = (__m128i *) &P[0]; \
    MEM_WRITE_I128(&__m128p[0], xmm0); \
    MEM_WRITE_I128(&__m128p[1], xmm1); \
    MEM_WRITE_I128(&__m128p[2], xmm2); \
    MEM_WRITE_I128(&__m128p[3], xmm3); \
}

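/* XOR one 512-bit register quadruple into another, lane by lane. */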
#define X128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) { \
    xmm0 = _mm_xor_si128(xmm0, xmm4); \
    xmm1 = _mm_xor_si128(xmm1, xmm5); \
    xmm2 = _mm_xor_si128(xmm2, xmm6); \
    xmm3 = _mm_xor_si128(xmm3, xmm7); \
}

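/* XOR a 512-bit register quadruple with a 512-bit block in memory. */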
#define X128M(P, xmm0, xmm1, xmm2, xmm3) { \
    const __m128i *__m128p = (const __m128i *) &P[0]; \
    xmm0 = _mm_xor_si128(xmm0, MEM_READ_I128(&__m128p[0])); \
    xmm1 = _mm_xor_si128(xmm1, MEM_READ_I128(&__m128p[1])); \
    xmm2 = _mm_xor_si128(xmm2, MEM_READ_I128(&__m128p[2])); \
    xmm3 = _mm_xor_si128(xmm3, MEM_READ_I128(&__m128p[3])); \
}

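/* XOR an MMX accumulator with a 64-bit table entry (used by EXTRACT32). */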
#define _mm_xor_64(mm0, mm1) _mm_xor_si64(mm0, _mm_cvtsi64_m64(mm1))

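/*
 * EXTRACT32/EXTRACT64 compute two adjacent 64-bit words of the LPS output:
 * for the selected row, eight 16-bit lanes are extracted from the state,
 * each of their bytes indexes the precomputed Ax tables (which combine the
 * S-box, byte permutation and linear transform), and the eight 64-bit
 * entries are XORed together into xmm4.
 */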
#define EXTRACT32(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \
    register unsigned short ax; \
    __m64 mm0, mm1; \
     \
    ax = (unsigned short) _mm_extract_epi16(xmm0, row + 0); \
    mm0  = _mm_cvtsi64_m64(Ax[0][LO(ax)]); \
    mm1  = _mm_cvtsi64_m64(Ax[0][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm0, row + 4); \
    mm0 = _mm_xor_64(mm0, Ax[1][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[1][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm1, row + 0); \
    mm0 = _mm_xor_64(mm0, Ax[2][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[2][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm1, row + 4); \
    mm0 = _mm_xor_64(mm0, Ax[3][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[3][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm2, row + 0); \
    mm0 = _mm_xor_64(mm0, Ax[4][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[4][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm2, row + 4); \
    mm0 = _mm_xor_64(mm0, Ax[5][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[5][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm3, row + 0); \
    mm0 = _mm_xor_64(mm0, Ax[6][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[6][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm3, row + 4); \
    mm0 = _mm_xor_64(mm0, Ax[7][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[7][HI(ax)]); \
    \
    xmm4 = _mm_set_epi64(mm1, mm0); \
}

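/* Same as EXTRACT32, but accumulated in 64-bit general-purpose registers. */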
#define EXTRACT64(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \
    __m128i tmm4; \
    register unsigned short ax; \
    register unsigned long long r0, r1; \
     \
    ax = (unsigned short) _mm_extract_epi16(xmm0, row + 0); \
    r0  = Ax[0][LO(ax)]; \
    r1  = Ax[0][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm0, row + 4); \
    r0 ^= Ax[1][LO(ax)]; \
    r1 ^= Ax[1][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm1, row + 0); \
    r0 ^= Ax[2][LO(ax)]; \
    r1 ^= Ax[2][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm1, row + 4); \
    r0 ^= Ax[3][LO(ax)]; \
    r1 ^= Ax[3][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm2, row + 0); \
    r0 ^= Ax[4][LO(ax)]; \
    r1 ^= Ax[4][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm2, row + 4); \
    r0 ^= Ax[5][LO(ax)]; \
    r1 ^= Ax[5][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm3, row + 0); \
    r0 ^= Ax[6][LO(ax)]; \
    r1 ^= Ax[6][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm3, row + 4); \
    r0 ^= Ax[7][LO(ax)]; \
    r1 ^= Ax[7][HI(ax)]; \
    \
    xmm4 = _mm_cvtsi64_si128((long long) r0); \
    tmm4 = _mm_cvtsi64_si128((long long) r1); \
    xmm4 = _mm_unpacklo_epi64(xmm4, tmm4); \
}

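/*
 * XLPS128M: XOR the state with a 512-bit operand in memory, then apply the
 * table-driven LPS transform to all four output columns.
 */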
#define XLPS128M(P, xmm0, xmm1, xmm2, xmm3) { \
    __m128i tmm0, tmm1, tmm2, tmm3; \
    X128M(P, xmm0, xmm1, xmm2, xmm3); \
    \
    EXTRACT(0, xmm0, xmm1, xmm2, xmm3, tmm0); \
    EXTRACT(1, xmm0, xmm1, xmm2, xmm3, tmm1); \
    EXTRACT(2, xmm0, xmm1, xmm2, xmm3, tmm2); \
    EXTRACT(3, xmm0, xmm1, xmm2, xmm3, tmm3); \
    \
    xmm0 = tmm0; \
    xmm1 = tmm1; \
    xmm2 = tmm2; \
    xmm3 = tmm3; \
}

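/*
 * XLPS128R: same as XLPS128M, but both operands are register quadruples;
 * the result is left in xmm4..xmm7.
 */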
#define XLPS128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) { \
    __m128i tmm0, tmm1, tmm2, tmm3; \
    X128R(xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3); \
    \
    EXTRACT(0, xmm4, xmm5, xmm6, xmm7, tmm0); \
    EXTRACT(1, xmm4, xmm5, xmm6, xmm7, tmm1); \
    EXTRACT(2, xmm4, xmm5, xmm6, xmm7, tmm2); \
    EXTRACT(3, xmm4, xmm5, xmm6, xmm7, tmm3); \
    \
    xmm4 = tmm0; \
    xmm5 = tmm1; \
    xmm6 = tmm2; \
    xmm7 = tmm3; \
}

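/*
 * One round of the E transform: mix iteration constant C[i] into the round
 * key (first quadruple), then apply LPSX with that key to the message
 * state (second quadruple).
 */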
#define ROUND128(i, xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7) { \
    XLPS128M((&C[i]), xmm0, xmm2, xmm4, xmm6); \
    XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); \
}
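
/*
 * Usage sketch (illustrative only, kept out of compilation with #if 0):
 * roughly how the macros above can be combined into the Streebog
 * compression function.  The state layout (union uint512_u), the lookup
 * tables Ax[8][256] and the iteration constants C[12] are defined elsewhere
 * in this package; the function below is an outline under those
 * assumptions, not the exact implementation shipped with the engine.
 */
#if 0
static void g(union uint512_u *h, const union uint512_u *N,
              const unsigned char *m)
{
    __m128i xmm0, xmm2, xmm4, xmm6;     /* round-key quadruple */
    __m128i xmm1, xmm3, xmm5, xmm7;     /* message quadruple   */
    unsigned int i;

    LOAD(N, xmm0, xmm2, xmm4, xmm6);
    XLPS128M(h, xmm0, xmm2, xmm4, xmm6);        /* K1 = LPS(h ^ N) */

    ULOAD(m, xmm1, xmm3, xmm5, xmm7);
    XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); /* round 1 */

    for (i = 0; i < 11; i++)                    /* rounds 2..12 */
        ROUND128(i, xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);

    XLPS128M((&C[11]), xmm0, xmm2, xmm4, xmm6); /* last round key */
    X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); /* E(K, m) */

    X128M(h, xmm0, xmm2, xmm4, xmm6);           /* ... ^ h */
    ULOAD(m, xmm1, xmm3, xmm5, xmm7);
    X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); /* ... ^ m */

    STORE(h, xmm0, xmm2, xmm4, xmm6);
# ifdef __i386__
    _mm_empty();        /* EXTRACT32 uses MMX; clear the FP/MMX state */
# endif
}
#endif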