/*
 * Implementation of core functions for GOST R 34.11-2012 using SSE2.
 *
 * Copyright (c) 2013 Cryptocom LTD.
 * This file is distributed under the same license as OpenSSL.
 *
 * Author: Alexey Degtyarev <alexey@renatasystems.org>
 *
 */

#ifndef __GOST3411_HAS_SSE2__
# error "GOST R 34.11-2012: SSE2 not enabled"
#endif

#include <mmintrin.h>
#include <emmintrin.h>

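/*
 * LO/HI split a 16-bit lane into its low and high byte, used below as
 * indices into the precomputed Ax lookup tables.
 */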
#define LO(v) ((unsigned char) (v))
#define HI(v) ((unsigned char) (((unsigned int) (v)) >> 8))

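/*
 * On i386 the 64-bit table values are presumably cheapest to combine
 * in MMX registers, so the MMX-based EXTRACT32 is selected there; on
 * 64-bit targets EXTRACT64 accumulates in ordinary 64-bit GPRs.
 */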
#ifdef __i386__
# define EXTRACT EXTRACT32
#else
# define EXTRACT EXTRACT64
#endif

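/*
 * Compilers other than ICC may lack these MMX conversion intrinsics,
 * so plain casts stand in for them; both are no-ops at machine level.
 */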
#ifndef __ICC
# define _mm_cvtsi64_m64(v) ((__m64) (v))
# define _mm_cvtm64_si64(v) ((long long) (v))
#endif

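/*
 * LOAD/UNLOAD move the 512-bit value at P to and from four XMM
 * registers.  _mm_load_si128/_mm_store_si128 are aligned accesses,
 * so P must be 16-byte aligned.
 */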
#define LOAD(P, xmm0, xmm1, xmm2, xmm3) { \
    const __m128i *__m128p = (const __m128i *) &P[0]; \
    xmm0 = _mm_load_si128(&__m128p[0]); \
    xmm1 = _mm_load_si128(&__m128p[1]); \
    xmm2 = _mm_load_si128(&__m128p[2]); \
    xmm3 = _mm_load_si128(&__m128p[3]); \
}

#define UNLOAD(P, xmm0, xmm1, xmm2, xmm3) { \
    __m128i *__m128p = (__m128i *) &P[0]; \
    _mm_store_si128(&__m128p[0], xmm0); \
    _mm_store_si128(&__m128p[1], xmm1); \
    _mm_store_si128(&__m128p[2], xmm2); \
    _mm_store_si128(&__m128p[3], xmm3); \
}

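/*
 * X128R xors one register quadruple into another; X128M xors the
 * 512-bit value at P into a register quadruple.  These implement the
 * X (add-key) step of the Streebog round function.
 */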
#define X128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) { \
    xmm0 = _mm_xor_si128(xmm0, xmm4); \
    xmm1 = _mm_xor_si128(xmm1, xmm5); \
    xmm2 = _mm_xor_si128(xmm2, xmm6); \
    xmm3 = _mm_xor_si128(xmm3, xmm7); \
}

#define X128M(P, xmm0, xmm1, xmm2, xmm3) { \
    const __m128i *__m128p = (const __m128i *) &P[0]; \
    xmm0 = _mm_xor_si128(xmm0, _mm_load_si128(&__m128p[0])); \
    xmm1 = _mm_xor_si128(xmm1, _mm_load_si128(&__m128p[1])); \
    xmm2 = _mm_xor_si128(xmm2, _mm_load_si128(&__m128p[2])); \
    xmm3 = _mm_xor_si128(xmm3, _mm_load_si128(&__m128p[3])); \
}

#define _mm_xor_64(mm0, mm1) _mm_xor_si64(mm0, _mm_cvtsi64_m64(mm1))

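/*
 * Each EXTRACT(row) computes two adjacent 64-bit words of the
 * combined S, P and L transforms (LPS) by xoring 16 precomputed
 * 64-bit entries of the Ax table, two per 16-bit lane pulled out of
 * the state.  EXTRACT32 accumulates the halves in MMX registers.
 */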
#define EXTRACT32(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \
    register unsigned short ax; \
    __m64 mm0, mm1; \
     \
    ax = (unsigned short) _mm_extract_epi16(xmm0, row + 0); \
    mm0  = _mm_cvtsi64_m64(Ax[0][LO(ax)]); \
    mm1  = _mm_cvtsi64_m64(Ax[0][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm0, row + 4); \
    mm0 = _mm_xor_64(mm0, Ax[1][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[1][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm1, row + 0); \
    mm0 = _mm_xor_64(mm0, Ax[2][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[2][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm1, row + 4); \
    mm0 = _mm_xor_64(mm0, Ax[3][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[3][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm2, row + 0); \
    mm0 = _mm_xor_64(mm0, Ax[4][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[4][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm2, row + 4); \
    mm0 = _mm_xor_64(mm0, Ax[5][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[5][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm3, row + 0); \
    mm0 = _mm_xor_64(mm0, Ax[6][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[6][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm3, row + 4); \
    mm0 = _mm_xor_64(mm0, Ax[7][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[7][HI(ax)]); \
    \
    xmm4 = _mm_set_epi64(mm1, mm0); \
}

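/*
 * __EXTRACT64 relies on _mm_extract_epi8, an SSE4.1 intrinsic, so it
 * is left unused under the SSE2 baseline; EXTRACT64 below reaches the
 * same result with SSE2 16-bit extracts.
 */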
#define __EXTRACT64(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \
    __m128i tmm4; \
    register unsigned long long r0, r1; \
    r0  = Ax[0][_mm_extract_epi8(xmm0, row + 0)]; \
    r0 ^= Ax[1][_mm_extract_epi8(xmm0, row + 8)]; \
    r0 ^= Ax[2][_mm_extract_epi8(xmm1, row + 0)]; \
    r0 ^= Ax[3][_mm_extract_epi8(xmm1, row + 8)]; \
    r0 ^= Ax[4][_mm_extract_epi8(xmm2, row + 0)]; \
    r0 ^= Ax[5][_mm_extract_epi8(xmm2, row + 8)]; \
    r0 ^= Ax[6][_mm_extract_epi8(xmm3, row + 0)]; \
    r0 ^= Ax[7][_mm_extract_epi8(xmm3, row + 8)]; \
    \
    r1  = Ax[0][_mm_extract_epi8(xmm0, row + 1)]; \
    r1 ^= Ax[1][_mm_extract_epi8(xmm0, row + 9)]; \
    r1 ^= Ax[2][_mm_extract_epi8(xmm1, row + 1)]; \
    r1 ^= Ax[3][_mm_extract_epi8(xmm1, row + 9)]; \
    r1 ^= Ax[4][_mm_extract_epi8(xmm2, row + 1)]; \
    r1 ^= Ax[5][_mm_extract_epi8(xmm2, row + 9)]; \
    r1 ^= Ax[6][_mm_extract_epi8(xmm3, row + 1)]; \
    r1 ^= Ax[7][_mm_extract_epi8(xmm3, row + 9)]; \
    xmm4 = _mm_cvtsi64_si128((long long) r0); \
    tmm4 = _mm_cvtsi64_si128((long long) r1); \
    xmm4 = _mm_unpacklo_epi64(xmm4, tmm4); \
}

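/*
 * The active 64-bit variant: the same lookups as EXTRACT32, but the
 * two halves are accumulated in general-purpose registers r0/r1.
 */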
#define EXTRACT64(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \
    __m128i tmm4; \
    register unsigned short ax; \
    register unsigned long long r0, r1; \
     \
    ax = (unsigned short) _mm_extract_epi16(xmm0, row + 0); \
    r0  = Ax[0][LO(ax)]; \
    r1  = Ax[0][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm0, row + 4); \
    r0 ^= Ax[1][LO(ax)]; \
    r1 ^= Ax[1][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm1, row + 0); \
    r0 ^= Ax[2][LO(ax)]; \
    r1 ^= Ax[2][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm1, row + 4); \
    r0 ^= Ax[3][LO(ax)]; \
    r1 ^= Ax[3][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm2, row + 0); \
    r0 ^= Ax[4][LO(ax)]; \
    r1 ^= Ax[4][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm2, row + 4); \
    r0 ^= Ax[5][LO(ax)]; \
    r1 ^= Ax[5][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm3, row + 0); \
    r0 ^= Ax[6][LO(ax)]; \
    r1 ^= Ax[6][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm3, row + 4); \
    r0 ^= Ax[7][LO(ax)]; \
    r1 ^= Ax[7][HI(ax)]; \
    \
    xmm4 = _mm_cvtsi64_si128((long long) r0); \
    tmm4 = _mm_cvtsi64_si128((long long) r1); \
    xmm4 = _mm_unpacklo_epi64(xmm4, tmm4); \
}

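/*
 * XLPS128M computes LPS(state ^ *P) for a memory operand, XLPS128R
 * the same for a register quadruple: xor in the key material, then
 * rebuild all four register pairs through the EXTRACT lookups.
 */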
#define XLPS128M(P, xmm0, xmm1, xmm2, xmm3) { \
    __m128i tmm0, tmm1, tmm2, tmm3; \
    X128M(P, xmm0, xmm1, xmm2, xmm3); \
    \
    EXTRACT(0, xmm0, xmm1, xmm2, xmm3, tmm0); \
    EXTRACT(1, xmm0, xmm1, xmm2, xmm3, tmm1); \
    EXTRACT(2, xmm0, xmm1, xmm2, xmm3, tmm2); \
    EXTRACT(3, xmm0, xmm1, xmm2, xmm3, tmm3); \
    \
    xmm0 = tmm0; \
    xmm1 = tmm1; \
    xmm2 = tmm2; \
    xmm3 = tmm3; \
}

#define XLPS128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) { \
    __m128i tmm0, tmm1, tmm2, tmm3; \
    X128R(xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3); \
    \
    EXTRACT(0, xmm4, xmm5, xmm6, xmm7, tmm0); \
    EXTRACT(1, xmm4, xmm5, xmm6, xmm7, tmm1); \
    EXTRACT(2, xmm4, xmm5, xmm6, xmm7, tmm2); \
    EXTRACT(3, xmm4, xmm5, xmm6, xmm7, tmm3); \
    \
    xmm4 = tmm0; \
    xmm5 = tmm1; \
    xmm6 = tmm2; \
    xmm7 = tmm3; \
}

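/*
 * One cipher round: fold the round constant C[i] into the key
 * schedule, then apply the updated key to the message state.
 */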
#define ROUND128(i, xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7) { \
    XLPS128M((&C[i]), xmm0, xmm2, xmm4, xmm6); \
    XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); \
}
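
/*
 * A minimal sketch (not part of the original header) of how the
 * macros above compose into the Streebog compression primitive
 * g(N, h, m) = E(LPS(h ^ N), m) ^ h ^ m.  It assumes the precomputed
 * tables Ax[8][256] and round constants C[12] from
 * gosthash2012_precalc.h, the aligned union uint512_u from
 * gosthash2012.h, and 16-byte alignment of all three arguments; the
 * name g_sse2_sketch is ours, not the engine's.
 */
static inline void g_sse2_sketch(union uint512_u *h,
                                 const union uint512_u *N,
                                 const union uint512_u *m)
{
    __m128i xmm0, xmm2, xmm4, xmm6; /* key schedule K */
    __m128i xmm1, xmm3, xmm5, xmm7; /* message state being encrypted */
    unsigned int i;

    LOAD(N, xmm0, xmm2, xmm4, xmm6);
    XLPS128M(h, xmm0, xmm2, xmm4, xmm6);        /* K1 = LPS(h ^ N) */

    LOAD(m, xmm1, xmm3, xmm5, xmm7);
    XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);

    for (i = 0; i < 11; i++)                    /* rounds 2..12 */
        ROUND128(i, xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);

    XLPS128M((&C[11]), xmm0, xmm2, xmm4, xmm6); /* final key K13 */
    X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);

    X128M(h, xmm0, xmm2, xmm4, xmm6);           /* ^ h */
    X128M(m, xmm0, xmm2, xmm4, xmm6);           /* ^ m */

    UNLOAD(h, xmm0, xmm2, xmm4, xmm6);
#ifdef __i386__
    _mm_empty();            /* clear MMX state left by EXTRACT32 */
#endif
}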