/*
 * Implementation of core functions for GOST R 34.11-2012 using SSE2.
 *
 * Copyright (c) 2013 Cryptocom LTD.
 * This file is distributed under the same license as OpenSSL.
 *
 * Author: Alexey Degtyarev <alexey@renatasystems.org>
 */

#ifndef __GOST3411_HAS_SSE2__
# error "GOST R 34.11-2012: SSE2 not enabled"
#endif

# pragma message "Using SIMD (SSE2) implementation"

#include <mmintrin.h>
#include <emmintrin.h>

/* Low and high bytes of a 16-bit value. */
#define LO(v) ((unsigned char) (v))
#define HI(v) ((unsigned char) (((unsigned int) (v)) >> 8))

/*
 * On 32-bit x86 the 64-bit table lookups are combined in MMX registers;
 * on 64-bit targets plain integer registers are used instead.
 */
#ifdef __i386__
# define EXTRACT EXTRACT32
#else
# define EXTRACT EXTRACT64
#endif

/*
 * For compilers other than ICC these conversions may not be available as
 * intrinsics; they are plain register reinterpretations, so casts suffice.
 */
#ifndef __ICC
# define _mm_cvtsi64_m64(v) ((__m64) (v))
# define _mm_cvtm64_si64(v) ((long long) (v))
#endif

/* Load a 64-byte, 16-byte-aligned block P into four XMM registers. */
#define LOAD(P, xmm0, xmm1, xmm2, xmm3) { \
    const __m128i *__m128p = (const __m128i *) &P[0]; \
    xmm0 = _mm_load_si128(&__m128p[0]); \
    xmm1 = _mm_load_si128(&__m128p[1]); \
    xmm2 = _mm_load_si128(&__m128p[2]); \
    xmm3 = _mm_load_si128(&__m128p[3]); \
}

/* Store four XMM registers back to a 64-byte, 16-byte-aligned block P. */
#define UNLOAD(P, xmm0, xmm1, xmm2, xmm3) { \
    __m128i *__m128p = (__m128i *) &P[0]; \
    _mm_store_si128(&__m128p[0], xmm0); \
    _mm_store_si128(&__m128p[1], xmm1); \
    _mm_store_si128(&__m128p[2], xmm2); \
    _mm_store_si128(&__m128p[3], xmm3); \
}

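/*
 * Usage sketch (editorial illustration; `state` is a hypothetical 16-byte
 * aligned, 64-byte buffer, not a name from this file):
 *
 *     __m128i r0, r1, r2, r3;
 *     LOAD(state, r0, r1, r2, r3);    // pull the 512-bit state into registers
 *     ...                             // transform r0..r3
 *     UNLOAD(state, r0, r1, r2, r3);  // write the result back
 *
 * Note that _mm_load_si128()/_mm_store_si128() fault on unaligned addresses,
 * so P must be 16-byte aligned.
 */
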
/* XOR the second register quadruple (xmm4..xmm7) into the first. */
#define X128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) { \
    xmm0 = _mm_xor_si128(xmm0, xmm4); \
    xmm1 = _mm_xor_si128(xmm1, xmm5); \
    xmm2 = _mm_xor_si128(xmm2, xmm6); \
    xmm3 = _mm_xor_si128(xmm3, xmm7); \
}

/* XOR a 64-byte aligned memory block P into a register quadruple. */
#define X128M(P, xmm0, xmm1, xmm2, xmm3) { \
    const __m128i *__m128p = (const __m128i *) &P[0]; \
    xmm0 = _mm_xor_si128(xmm0, _mm_load_si128(&__m128p[0])); \
    xmm1 = _mm_xor_si128(xmm1, _mm_load_si128(&__m128p[1])); \
    xmm2 = _mm_xor_si128(xmm2, _mm_load_si128(&__m128p[2])); \
    xmm3 = _mm_xor_si128(xmm3, _mm_load_si128(&__m128p[3])); \
}

/* XOR a 64-bit table value into an MMX register. */
#define _mm_xor_64(mm0, mm1) _mm_xor_si64(mm0, _mm_cvtsi64_m64(mm1))

/*
 * One row of the LPS (substitution + linear) transform via the precomputed
 * Ax tables, accumulating the two 64-bit results in MMX registers.  Used on
 * 32-bit x86, where 64-bit general-purpose registers are not available.
 */
#define EXTRACT32(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \
    register unsigned short ax; \
    __m64 mm0, mm1; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm0, row + 0); \
    mm0 = _mm_cvtsi64_m64(Ax[0][LO(ax)]); \
    mm1 = _mm_cvtsi64_m64(Ax[0][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm0, row + 4); \
    mm0 = _mm_xor_64(mm0, Ax[1][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[1][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm1, row + 0); \
    mm0 = _mm_xor_64(mm0, Ax[2][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[2][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm1, row + 4); \
    mm0 = _mm_xor_64(mm0, Ax[3][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[3][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm2, row + 0); \
    mm0 = _mm_xor_64(mm0, Ax[4][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[4][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm2, row + 4); \
    mm0 = _mm_xor_64(mm0, Ax[5][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[5][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm3, row + 0); \
    mm0 = _mm_xor_64(mm0, Ax[6][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[6][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm3, row + 4); \
    mm0 = _mm_xor_64(mm0, Ax[7][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[7][HI(ax)]); \
    \
    xmm4 = _mm_set_epi64(mm1, mm0); \
}

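/*
 * Editorial note: EXTRACT32 works in MMX registers, which alias the x87
 * floating-point register stack.  The hash computation itself is integer
 * only, but code that mixes this 32-bit path with x87 floating point must
 * issue _mm_empty() (EMMS) after the MMX work.
 */
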
/*
 * Byte-extract variant of EXTRACT64.  _mm_extract_epi8() is SSE4.1, not
 * SSE2, and the row parameter here is a byte offset (0, 2, 4, 6) rather
 * than the 16-bit lane index the other variants take, so this macro is not
 * wired up to EXTRACT and is kept for reference only.
 */
#define __EXTRACT64(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \
    __m128i tmm4; \
    register unsigned long long r0, r1; \
    r0  = Ax[0][_mm_extract_epi8(xmm0, row + 0)]; \
    r0 ^= Ax[1][_mm_extract_epi8(xmm0, row + 8)]; \
    r0 ^= Ax[2][_mm_extract_epi8(xmm1, row + 0)]; \
    r0 ^= Ax[3][_mm_extract_epi8(xmm1, row + 8)]; \
    r0 ^= Ax[4][_mm_extract_epi8(xmm2, row + 0)]; \
    r0 ^= Ax[5][_mm_extract_epi8(xmm2, row + 8)]; \
    r0 ^= Ax[6][_mm_extract_epi8(xmm3, row + 0)]; \
    r0 ^= Ax[7][_mm_extract_epi8(xmm3, row + 8)]; \
    \
    r1  = Ax[0][_mm_extract_epi8(xmm0, row + 1)]; \
    r1 ^= Ax[1][_mm_extract_epi8(xmm0, row + 9)]; \
    r1 ^= Ax[2][_mm_extract_epi8(xmm1, row + 1)]; \
    r1 ^= Ax[3][_mm_extract_epi8(xmm1, row + 9)]; \
    r1 ^= Ax[4][_mm_extract_epi8(xmm2, row + 1)]; \
    r1 ^= Ax[5][_mm_extract_epi8(xmm2, row + 9)]; \
    r1 ^= Ax[6][_mm_extract_epi8(xmm3, row + 1)]; \
    r1 ^= Ax[7][_mm_extract_epi8(xmm3, row + 9)]; \
    xmm4 = _mm_cvtsi64_si128((long long) r0); \
    tmm4 = _mm_cvtsi64_si128((long long) r1); \
    xmm4 = _mm_unpacklo_epi64(xmm4, tmm4); \
}

/*
 * One row of the LPS transform in 64-bit general-purpose registers; this is
 * the variant used on x86-64.  A 16-bit lane is extracted per column and
 * LO()/HI() give the byte indices for the two output words.
 */
#define EXTRACT64(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \
    __m128i tmm4; \
    register unsigned short ax; \
    register unsigned long long r0, r1; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm0, row + 0); \
    r0  = Ax[0][LO(ax)]; \
    r1  = Ax[0][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm0, row + 4); \
    r0 ^= Ax[1][LO(ax)]; \
    r1 ^= Ax[1][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm1, row + 0); \
    r0 ^= Ax[2][LO(ax)]; \
    r1 ^= Ax[2][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm1, row + 4); \
    r0 ^= Ax[3][LO(ax)]; \
    r1 ^= Ax[3][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm2, row + 0); \
    r0 ^= Ax[4][LO(ax)]; \
    r1 ^= Ax[4][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm2, row + 4); \
    r0 ^= Ax[5][LO(ax)]; \
    r1 ^= Ax[5][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm3, row + 0); \
    r0 ^= Ax[6][LO(ax)]; \
    r1 ^= Ax[6][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm3, row + 4); \
    r0 ^= Ax[7][LO(ax)]; \
    r1 ^= Ax[7][HI(ax)]; \
    \
    xmm4 = _mm_cvtsi64_si128((long long) r0); \
    tmm4 = _mm_cvtsi64_si128((long long) r1); \
    xmm4 = _mm_unpacklo_epi64(xmm4, tmm4); \
}

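/*
 * Scalar reference for what one EXTRACT computes (editorial sketch; Ax is the
 * Ax[8][256] table of precomputed LPS values defined elsewhere in this
 * engine).  Output byte position j (0..7) of the transformed state is the
 * XOR of eight table entries, one per 8-byte column of the 64-byte input:
 *
 *     unsigned long long lps_word(const unsigned char in[64], int j)
 *     {
 *         unsigned long long r = 0;
 *         int col;
 *         for (col = 0; col < 8; col++)
 *             r ^= Ax[col][in[col * 8 + j]];
 *         return r;
 *     }
 *
 * EXTRACT(row, ...) produces the pair j = 2*row and j = 2*row + 1 packed
 * into one XMM register.
 */
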
/* XOR a 64-byte memory block P into the quadruple, then apply the
 * table-driven LPS transform over all four rows. */
#define XLPS128M(P, xmm0, xmm1, xmm2, xmm3) { \
    __m128i tmm0, tmm1, tmm2, tmm3; \
    X128M(P, xmm0, xmm1, xmm2, xmm3); \
    \
    EXTRACT(0, xmm0, xmm1, xmm2, xmm3, tmm0); \
    EXTRACT(1, xmm0, xmm1, xmm2, xmm3, tmm1); \
    EXTRACT(2, xmm0, xmm1, xmm2, xmm3, tmm2); \
    EXTRACT(3, xmm0, xmm1, xmm2, xmm3, tmm3); \
    \
    xmm0 = tmm0; \
    xmm1 = tmm1; \
    xmm2 = tmm2; \
    xmm3 = tmm3; \
}

/* Register-operand variant: XOR the first quadruple into the second, then
 * apply LPS to the second quadruple in place. */
#define XLPS128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) { \
    __m128i tmm0, tmm1, tmm2, tmm3; \
    X128R(xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3); \
    \
    EXTRACT(0, xmm4, xmm5, xmm6, xmm7, tmm0); \
    EXTRACT(1, xmm4, xmm5, xmm6, xmm7, tmm1); \
    EXTRACT(2, xmm4, xmm5, xmm6, xmm7, tmm2); \
    EXTRACT(3, xmm4, xmm5, xmm6, xmm7, tmm3); \
    \
    xmm4 = tmm0; \
    xmm5 = tmm1; \
    xmm6 = tmm2; \
    xmm7 = tmm3; \
}

/* One round of the E transform: mix round constant C[i] into the key
 * quadruple (with LPS), then mix the updated key into the message
 * quadruple (with LPS). */
#define ROUND128(i, xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7) { \
    XLPS128M((&C[i]), xmm0, xmm2, xmm4, xmm6); \
    XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); \
}
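
/*
 * Usage sketch of the round macros (editorial illustration, heavily
 * simplified; the actual compression function lives in gosthash2012.c and
 * may differ in detail).  For g(h, N, m) the key schedule starts from
 * K1 = LPS(h ^ N), kept in the xmm0/xmm2/xmm4/xmm6 quadruple, while the
 * message block is transformed in xmm1/xmm3/xmm5/xmm7:
 *
 *     LOAD(N, xmm0, xmm2, xmm4, xmm6);
 *     XLPS128M(h, xmm0, xmm2, xmm4, xmm6);         // K1 = LPS(N ^ h)
 *     LOAD(m, xmm1, xmm3, xmm5, xmm7);
 *     XLPS128R(xmm0, xmm2, xmm4, xmm6,
 *              xmm1, xmm3, xmm5, xmm7);            // round 1: s = LPS(m ^ K1)
 *     for (i = 0; i < 11; i++)                     // rounds 2..12, C[0..10]
 *         ROUND128(i, xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
 *     XLPS128M((&C[11]), xmm0, xmm2, xmm4, xmm6);  // K13 = LPS(K12 ^ C12)
 *     X128R(xmm1, xmm3, xmm5, xmm7,
 *           xmm0, xmm2, xmm4, xmm6);               // s ^= K13
 *
 * followed by the whitening XORs with h and m and an UNLOAD back to h.
 */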