gosthash2012: Improve SIMD implementation

author dmitry dulesov <dmitry.dulesov@gmail.com>

Fri, 31 Jan 2020 23:59:07 +0000 (02:59 +0300)

committer Dmitry Belyavskiy <beldmit@users.noreply.github.com>

Sun, 2 Feb 2020 18:05:29 +0000 (21:05 +0300)
author dmitry dulesov <dmitry.dulesov@gmail.com>
Fri, 31 Jan 2020 23:59:07 +0000 (02:59 +0300)
committer Dmitry Belyavskiy <beldmit@users.noreply.github.com>
Sun, 2 Feb 2020 18:05:29 +0000 (21:05 +0300)
diff --git a/gosthash2012.c b/gosthash2012.c

index d6cde2154a7143371057a530473c057c3f2dbed9..6399a9edd2d5dca686c78434de4ecabe0dc9ef33 100644 (file)
--- a/gosthash2012.c
+++ b/gosthash2012.c
@@ -118,7 +118,7 @@ static void g(union uint512_u *h, const union uint512_u * RESTRICT N,
      LOAD(N, xmm0, xmm2, xmm4, xmm6);
      XLPS128M(h, xmm0, xmm2, xmm4, xmm6);
  
-    LOAD(m, xmm1, xmm3, xmm5, xmm7);
+    ULOAD(m, xmm1, xmm3, xmm5, xmm7);
      XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
  
      for (i = 0; i < 11; i++)
@@ -128,12 +128,10 @@ static void g(union uint512_u *h, const union uint512_u * RESTRICT N,
      X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
  
      X128M(h, xmm0, xmm2, xmm4, xmm6);
-    X128M(m, xmm0, xmm2, xmm4, xmm6);
-
-    UNLOAD(h, xmm0, xmm2, xmm4, xmm6);
+    ULOAD(m, xmm1, xmm3, xmm5, xmm7);
+    X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
  
-    /* Restore the Floating-point status on the CPU */
-    _mm_empty();
+    STORE(h, xmm0, xmm2, xmm4, xmm6);
  #else
      union uint512_u Ki, data;
      unsigned int i;
diff --git a/gosthash2012.h b/gosthash2012.h

index 5b1cd9f1f5c61677d31156447543f0fd49cd0c75..8947ee6ab7e5404aff5da98b04d5637dfc770814 100644 (file)
--- a/gosthash2012.h
+++ b/gosthash2012.h
@@ -12,6 +12,17 @@
  
  #ifdef __SSE2__
  # define __GOST3411_HAS_SSE2__
+# if !defined(__x86_64__)
+/*
+ * The x86-64 Linux and Windows ABIs provide a malloc function that returns
+ * 16-byte aligned memory buffers, as required by SSE load/store instructions.
+ * Other platforms need a special trick to allocate the gost2012_hash_ctx
+ * structure properly, so it is easier to switch to the unaligned loadu/storeu
+ * memory access instructions in this case.
+ */
+#  define UNALIGNED_SIMD_ACCESS
+#  pragma message "Use unaligned SIMD memory access"
+# endif
  #endif
  
  #ifdef __GOST3411_HAS_SSE2__
diff --git a/gosthash2012_sse2.h b/gosthash2012_sse2.h

index f45dab18e241adfc86abf7c02275cb17a25d38ea..188f7b8d12a48bfe1c07fd046f2020004446abac 100644 (file)
--- a/gosthash2012_sse2.h
+++ b/gosthash2012_sse2.h
@@ -16,6 +16,9 @@
  
  #include <mmintrin.h>
  #include <emmintrin.h>
+#ifdef __SSE3__
+# include <pmmintrin.h>
+#endif
  
  #define LO(v) ((unsigned char) (v))
  #define HI(v) ((unsigned char) (((unsigned int) (v)) >> 8))
@@ -31,20 +34,50 @@
  # define _mm_cvtm64_si64(v) (long long) v
  #endif
  
+#ifdef __SSE3__
+/*
+ * "This intrinsic may perform better than _mm_loadu_si128 when
+ * the data crosses a cache line boundary."
+ */
+# define UMEM_READ_I128 _mm_lddqu_si128
+#else /* SSE2 */
+# define UMEM_READ_I128 _mm_loadu_si128
+#endif
+
+/* Load 512 bits from unaligned memory. */
+#define ULOAD(P, xmm0, xmm1, xmm2, xmm3) { \
+    const __m128i *__m128p = (const __m128i *) P; \
+    xmm0 = UMEM_READ_I128(&__m128p[0]); \
+    xmm1 = UMEM_READ_I128(&__m128p[1]); \
+    xmm2 = UMEM_READ_I128(&__m128p[2]); \
+    xmm3 = UMEM_READ_I128(&__m128p[3]); \
+}
+
+#ifdef UNALIGNED_SIMD_ACCESS
+
+# define MEM_WRITE_I128         _mm_storeu_si128
+# define MEM_READ_I128  UMEM_READ_I128
+# define LOAD           ULOAD
+
+#else /* !UNALIGNED_SIMD_ACCESS */
+
+# define MEM_WRITE_I128          _mm_store_si128
+# define MEM_READ_I128  _mm_load_si128
  #define LOAD(P, xmm0, xmm1, xmm2, xmm3) { \
-    const __m128i *__m128p = (const __m128i *) &P[0]; \
-    xmm0 = _mm_load_si128(&__m128p[0]); \
-    xmm1 = _mm_load_si128(&__m128p[1]); \
-    xmm2 = _mm_load_si128(&__m128p[2]); \
-    xmm3 = _mm_load_si128(&__m128p[3]); \
+    const __m128i *__m128p = (const __m128i *) P; \
+    xmm0 = MEM_READ_I128(&__m128p[0]); \
+    xmm1 = MEM_READ_I128(&__m128p[1]); \
+    xmm2 = MEM_READ_I128(&__m128p[2]); \
+    xmm3 = MEM_READ_I128(&__m128p[3]); \
  }
+#endif /* !UNALIGNED_SIMD_ACCESS */
  
-#define UNLOAD(P, xmm0, xmm1, xmm2, xmm3) { \
+#define STORE(P, xmm0, xmm1, xmm2, xmm3) { \
      __m128i *__m128p = (__m128i *) &P[0]; \
-    _mm_store_si128(&__m128p[0], xmm0); \
-    _mm_store_si128(&__m128p[1], xmm1); \
-    _mm_store_si128(&__m128p[2], xmm2); \
-    _mm_store_si128(&__m128p[3], xmm3); \
+    MEM_WRITE_I128(&__m128p[0], xmm0); \
+    MEM_WRITE_I128(&__m128p[1], xmm1); \
+    MEM_WRITE_I128(&__m128p[2], xmm2); \
+    MEM_WRITE_I128(&__m128p[3], xmm3); \
  }
  
  #define X128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) { \
@@ -56,10 +89,10 @@
  
  #define X128M(P, xmm0, xmm1, xmm2, xmm3) { \
      const __m128i *__m128p = (const __m128i *) &P[0]; \
-    xmm0 = _mm_xor_si128(xmm0, _mm_load_si128(&__m128p[0])); \
-    xmm1 = _mm_xor_si128(xmm1, _mm_load_si128(&__m128p[1])); \
-    xmm2 = _mm_xor_si128(xmm2, _mm_load_si128(&__m128p[2])); \
-    xmm3 = _mm_xor_si128(xmm3, _mm_load_si128(&__m128p[3])); \
+    xmm0 = _mm_xor_si128(xmm0, MEM_READ_I128(&__m128p[0])); \
+    xmm1 = _mm_xor_si128(xmm1, MEM_READ_I128(&__m128p[1])); \
+    xmm2 = _mm_xor_si128(xmm2, MEM_READ_I128(&__m128p[2])); \
+    xmm3 = _mm_xor_si128(xmm3, MEM_READ_I128(&__m128p[3])); \
  }
  
  #define _mm_xor_64(mm0, mm1) _mm_xor_si64(mm0, _mm_cvtsi64_m64(mm1))
author	dmitry dulesov <dmitry.dulesov@gmail.com>
	Fri, 31 Jan 2020 23:59:07 +0000 (02:59 +0300)
committer	Dmitry Belyavskiy <beldmit@users.noreply.github.com>
	Sun, 2 Feb 2020 18:05:29 +0000 (21:05 +0300)
gosthash2012.c		patch \| blob \| history
gosthash2012.h		patch \| blob \| history
gosthash2012_sse2.h		patch \| blob \| history