From 6ae982cccf42a54cef60f5948aa46604859b4871 Mon Sep 17 00:00:00 2001 From: Mounir IDRASSI Date: Fri, 17 Jun 2016 14:16:57 +0200 Subject: Update intrinsic support and cpu detection. --- src/Crypto/cpu.c | 8 ++--- src/Crypto/cpu.h | 94 ++++++++++++++++++++++++++++++++++++++++++++++--------- src/Crypto/misc.h | 4 ++- 3 files changed, 87 insertions(+), 19 deletions(-) diff --git a/src/Crypto/cpu.c b/src/Crypto/cpu.c index 3de87069..5f3643de 100644 --- a/src/Crypto/cpu.c +++ b/src/Crypto/cpu.c @@ -12,10 +12,6 @@ #include #endif -#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE -#include -#endif - #ifdef CRYPTOPP_CPUID_AVAILABLE #if _MSC_VER >= 1400 && CRYPTOPP_BOOL_X64 @@ -165,6 +161,7 @@ static int TrySSE2() int g_x86DetectionDone = 0; int g_hasISSE = 0, g_hasSSE2 = 0, g_hasSSSE3 = 0, g_hasMMX = 0, g_hasAESNI = 0, g_hasCLMUL = 0, g_isP4 = 0; +int g_hasAVX = 0, g_hasSSE42 = 0, g_hasSSE41 = 0; uint32 g_cacheLineSize = CRYPTOPP_L1_CACHE_LINE_SIZE; VC_INLINE int IsIntel(const uint32 output[4]) @@ -194,6 +191,9 @@ void DetectX86Features() g_hasMMX = (cpuid1[3] & (1 << 23)) != 0; if ((cpuid1[3] & (1 << 26)) != 0) g_hasSSE2 = TrySSE2(); + g_hasAVX = g_hasSSE2 && (cpuid1[2] & (1 << 28)); + g_hasSSE42 = g_hasSSE2 && (cpuid1[2] & (1 << 20)); + g_hasSSE41 = g_hasSSE2 && (cpuid1[2] & (1 << 19)); g_hasSSSE3 = g_hasSSE2 && (cpuid1[2] & (1<<9)); g_hasAESNI = g_hasSSE2 && (cpuid1[2] & (1<<25)); g_hasCLMUL = g_hasSSE2 && (cpuid1[2] & (1<<1)); diff --git a/src/Crypto/cpu.h b/src/Crypto/cpu.h index 44da8cc3..d6015e1c 100644 --- a/src/Crypto/cpu.h +++ b/src/Crypto/cpu.h @@ -4,6 +4,10 @@ #include "Common/Tcdefs.h" #include "config.h" +#if defined(__cplusplus) +extern "C" { +#endif + #ifdef CRYPTOPP_GENERATE_X64_MASM #define CRYPTOPP_X86_ASM_AVAILABLE @@ -13,30 +17,90 @@ #else #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE +#if defined(TC_WINDOWS_DRIVER) +typedef union __declspec(intrin_type) CRYPTOPP_ALIGN_DATA(8) __m64 +{ + unsigned __int64 m64_u64; + float m64_f32[2]; + __int8 m64_i8[8]; + __int16 m64_i16[4]; + __int32 m64_i32[2]; + __int64 m64_i64; + unsigned __int8 m64_u8[8]; + unsigned __int16 m64_u16[4]; + unsigned __int32 m64_u32[2]; +} __m64; + +typedef union __declspec(intrin_type) CRYPTOPP_ALIGN_DATA(16) __m128 { + float m128_f32[4]; + unsigned __int64 m128_u64[2]; + __int8 m128_i8[16]; + __int16 m128_i16[8]; + __int32 m128_i32[4]; + __int64 m128_i64[2]; + unsigned __int8 m128_u8[16]; + unsigned __int16 m128_u16[8]; + unsigned __int32 m128_u32[4]; + } __m128; + +typedef union __declspec(intrin_type) CRYPTOPP_ALIGN_DATA(16) __m128i { + __int8 m128i_i8[16]; + __int16 m128i_i16[8]; + __int32 m128i_i32[4]; + __int64 m128i_i64[2]; + unsigned __int8 m128i_u8[16]; + unsigned __int16 m128i_u16[8]; + unsigned __int32 m128i_u32[4]; + unsigned __int64 m128i_u64[2]; +} __m128i; + +typedef struct __declspec(intrin_type) CRYPTOPP_ALIGN_DATA(16) __m128d { + double m128d_f64[2]; +} __m128d; + +#define _MM_SHUFFLE2(x,y) (((x)<<1) | (y)) + +extern void _m_empty(void); +extern int _mm_extract_epi16(__m128i _A, int _Imm); +extern __m128i _mm_load_si128(__m128i const*_P); +extern __m128i _mm_xor_si128(__m128i _A, __m128i _B); +extern __m128i _mm_cvtsi64_si128(__int64); +extern __m128i _mm_unpacklo_epi64(__m128i _A, __m128i _B); +extern void _mm_store_si128(__m128i *_P, __m128i _B); +extern __m64 _m_pxor(__m64 _MM1, __m64 _MM2); +extern __m128i _mm_set_epi64(__m64 _Q1, __m64 _Q0); +#define _mm_xor_si64 _m_pxor +#define _mm_empty _m_empty +#else +#include #include #endif +#endif #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE #if defined(__SSSE3__) || defined(__INTEL_COMPILER) -#ifdef TC_WINDOWS_DRIVER +#if defined(TC_WINDOWS_DRIVER) extern __m128i _mm_shuffle_epi8 (__m128i a, __m128i b); #else #include #endif #endif -#if defined(__SSE4_1__) || defined(__INTEL_COMPILER) -#ifdef TC_WINDOWS_DRIVER +#if defined(__SSE4_1__) || defined(__INTEL_COMPILER) || defined(_MSC_VER) +#if defined(TC_WINDOWS_DRIVER) extern int _mm_extract_epi32(__m128i src, const int ndx); extern __m128i _mm_insert_epi32(__m128i dst, int s, const int ndx); +#if defined(_M_X64) +extern __m128i _mm_insert_epi64(__m128i dst, __int64 s, const int ndx); +#endif #else #include #endif #endif -#if (defined(__AES__) && defined(__PCLMUL__)) || defined(__INTEL_COMPILER) -#ifdef TC_WINDOWS_DRIVER -extern __m128i _mm_clmulepi64_si128(__m128i v1, __m128i v2, +#if (defined(__AES__) && defined(__PCLMUL__)) || defined(__INTEL_COMPILER) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE +#if defined(TC_WINDOWS_DRIVER) +extern __m128i _mm_clmulepi64_si128(__m128i v1, __m128i v2, const int imm8); extern __m128i _mm_aeskeygenassist_si128(__m128i ckey, const int rcon); extern __m128i _mm_aesimc_si128(__m128i v); @@ -54,12 +118,11 @@ extern __m128i _mm_aesdeclast_si128(__m128i v, __m128i rkey); #define CRYPTOPP_CPUID_AVAILABLE -#if defined(__cplusplus) -extern "C" { -#endif - // these should not be used directly extern int g_x86DetectionDone; +extern int g_hasAVX; +extern int g_hasSSE42; +extern int g_hasSSE41; extern int g_hasSSSE3; extern int g_hasAESNI; extern int g_hasCLMUL; @@ -84,16 +147,15 @@ extern int g_hasMMX; #endif +#define HasSSE42() g_hasSSE42 +#define HasSSE41() g_hasSSE41 +#define HasSAVX() g_hasAVX #define HasSSSE3() g_hasSSSE3 #define HasAESNI() g_hasAESNI #define HasCLMUL() g_hasCLMUL #define IsP4() g_isP4 #define GetCacheLineSize() g_cacheLineSize -#if defined(__cplusplus) -} -#endif - #else #define GetCacheLineSize() CRYPTOPP_L1_CACHE_LINE_SIZE @@ -305,4 +367,8 @@ extern int g_hasMMX; AS2( add outputPtr, increment*16) +#if defined(__cplusplus) +} +#endif + #endif diff --git a/src/Crypto/misc.h b/src/Crypto/misc.h index 2b4e9089..75ecedcd 100644 --- a/src/Crypto/misc.h +++ b/src/Crypto/misc.h @@ -87,10 +87,12 @@ #define bswap_32 OSSwapInt32 #define bswap_64 OSSwapInt64 #else -#ifdef CRYPTOPP_FAST_ROTATE(32) +#if CRYPTOPP_FAST_ROTATE(32) #define bswap_32(x) (rotr32((x), 8U) & 0xff00ff00) | (rotl32((x), 8U) & 0x00ff00ff) #else +#define CRYPTOPP_BYTESWAP_AVAILABLE #define bswap_32(x) (rotl32((((x) & 0xFF00FF00) >> 8) | (((x) & 0x00FF00FF) << 8), 16U)) +#define bswap_64(x) rotl64(((((((x & LL(0xFF00FF00FF00FF00)) >> 8) | ((x & LL(0x00FF00FF00FF00FF)) << 8)) & LL(0xFFFF0000FFFF0000)) >> 16) | (((((x & LL(0xFF00FF00FF00FF00)) >> 8) | ((x & LL(0x00FF00FF00FF00FF)) << 8)) & LL(0x0000FFFF0000FFFF)) << 16)), 32U) #endif #ifndef TC_NO_COMPILER_INT64 #define bswap_64(x) rotl64(((((((x & LL(0xFF00FF00FF00FF00)) >> 8) | ((x & LL(0x00FF00FF00FF00FF)) << 8)) & LL(0xFFFF0000FFFF0000)) >> 16) | (((((x & LL(0xFF00FF00FF00FF00)) >> 8) | ((x & LL(0x00FF00FF00FF00FF)) << 8)) & LL(0x0000FFFF0000FFFF)) << 16)), 32U) -- cgit v1.2.3