VeraCrypt
aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/Crypto/SerpentFast.c318
-rw-r--r--src/Crypto/SerpentFast.h28
-rw-r--r--src/Crypto/SerpentFast_sbox.h437
-rw-r--r--src/Crypto/SerpentFast_simd.cpp338
4 files changed, 1121 insertions, 0 deletions
diff --git a/src/Crypto/SerpentFast.c b/src/Crypto/SerpentFast.c
new file mode 100644
index 00000000..02777a8f
--- /dev/null
+++ b/src/Crypto/SerpentFast.c
@@ -0,0 +1,318 @@
+/*
+* Serpent
+* (C) 1999-2007 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include "SerpentFast.h"
+#include "SerpentFast_sbox.h"
+#include "Common/Endian.h"
+#if !defined(_UEFI)
+#include <memory.h>
+#include <stdlib.h>
+#endif
+#include "cpu.h"
+#include "misc.h"
+
+#if BYTE_ORDER == BIG_ENDIAN
+
+#define BOTAN_ENDIAN_N2B(x) (x)
+#define BOTAN_ENDIAN_B2N(x) (x)
+
+#define BOTAN_ENDIAN_N2L(x) bswap_32(x)
+#define BOTAN_ENDIAN_L2N(x) bswap_32(x)
+
+#elif BYTE_ORDER == LITTLE_ENDIAN
+
+#define BOTAN_ENDIAN_N2L(x) (x)
+#define BOTAN_ENDIAN_L2N(x) (x)
+
+#define BOTAN_ENDIAN_N2B(x) bswap_32(x)
+#define BOTAN_ENDIAN_B2N(x) bswap_32(x)
+
+#endif
+
+#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
+extern void serpent_simd_encrypt_blocks_4(const unsigned __int8 in[], unsigned __int8 out[], unsigned __int32* round_key);
+extern void serpent_simd_decrypt_blocks_4(const unsigned __int8 in[], unsigned __int8 out[], unsigned __int32* round_key);
+#endif
+
+/*
+* Serpent's Linear Transform
+*/
+#define transform(B0,B1,B2,B3) \
+ do { \
+ B0 = rotl32(B0, 13); B2 = rotl32(B2, 3); \
+ B1 ^= B0 ^ B2; B3 ^= B2 ^ (B0 << 3); \
+ B1 = rotl32(B1, 1); B3 = rotl32(B3, 7); \
+ B0 ^= B1 ^ B3; B2 ^= B3 ^ (B1 << 7); \
+ B0 = rotl32(B0, 5); B2 = rotl32(B2, 22); \
+ } while (0);
+
+/*
+* Serpent's Inverse Linear Transform
+*/
+#define i_transform(B0,B1,B2,B3) \
+ do { \
+ B2 = rotr32(B2, 22); B0 = rotr32(B0, 5); \
+ B2 ^= B3 ^ (B1 << 7); B0 ^= B1 ^ B3; \
+ B3 = rotr32(B3, 7); B1 = rotr32(B1, 1); \
+ B3 ^= B2 ^ (B0 << 3); B1 ^= B0 ^ B2; \
+ B2 = rotr32(B2, 3); B0 = rotr32(B0, 13); \
+ } while (0);
+
+
+/*
+* XOR a key block with a data block
+*/
+#define key_xor(round, B0, B1, B2, B3) \
+ B0 ^= round_key[4*round ]; \
+ B1 ^= round_key[4*round+1]; \
+ B2 ^= round_key[4*round+2]; \
+ B3 ^= round_key[4*round+3];
+
+/*
+* Serpent Encryption
+*/
+void serpent_encrypt_blocks(const unsigned __int8* in, unsigned __int8* out, size_t blocks, unsigned __int8 *ks)
+{
+ unsigned __int32 B0, B1, B2, B3;
+ unsigned __int32* round_key = ((unsigned __int32*) ks) + 8;
+ size_t i;
+#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
+ if(HasSSE2() && (blocks >= 4))
+ {
+ CRYPTOPP_ALIGN_DATA(16) unsigned __int8 alignedInputBuffer[4 * 16];
+ CRYPTOPP_ALIGN_DATA(16) unsigned __int8 alignedOutputBuffer[4 * 16];
+ unsigned __int8 *inPtr, *outPtr;
+ BOOL inputAligned = IsAligned16(in)? TRUE : FALSE;
+ BOOL outputAligned = IsAligned16(out)? TRUE : FALSE;
+
+ while(blocks >= 4)
+ {
+ if (inputAligned)
+ inPtr = (unsigned __int8 *) in;
+ else
+ {
+ inPtr = alignedInputBuffer;
+ memcpy (inPtr, in, 4 * 16);
+ }
+ if (out == in)
+ outPtr = inPtr;
+ else if (outputAligned)
+ outPtr = (unsigned __int8 *) out;
+ else
+ {
+ outPtr = alignedOutputBuffer;
+ memcpy (outPtr, out, 4 * 16);
+ }
+ serpent_simd_encrypt_blocks_4(inPtr, outPtr, round_key);
+ in += 4 * 16;
+ out += 4 * 16;
+ blocks -= 4;
+ }
+ }
+#endif
+
+ for(i = 0; i != blocks; ++i)
+ {
+ memcpy (&B0, in + 0, 4);
+ memcpy (&B1, in + 4, 4);
+ memcpy (&B2, in + 8, 4);
+ memcpy (&B3, in + 12, 4);
+ B0 = BOTAN_ENDIAN_N2L (B0);
+ B1 = BOTAN_ENDIAN_N2L (B1);
+ B2 = BOTAN_ENDIAN_N2L (B2);
+ B3 = BOTAN_ENDIAN_N2L (B3);
+
+ key_xor( 0,B0,B1,B2,B3); SBoxE1(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 1,B0,B1,B2,B3); SBoxE2(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 2,B0,B1,B2,B3); SBoxE3(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 3,B0,B1,B2,B3); SBoxE4(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 4,B0,B1,B2,B3); SBoxE5(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 5,B0,B1,B2,B3); SBoxE6(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 6,B0,B1,B2,B3); SBoxE7(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 7,B0,B1,B2,B3); SBoxE8(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 8,B0,B1,B2,B3); SBoxE1(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 9,B0,B1,B2,B3); SBoxE2(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(10,B0,B1,B2,B3); SBoxE3(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(11,B0,B1,B2,B3); SBoxE4(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(12,B0,B1,B2,B3); SBoxE5(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(13,B0,B1,B2,B3); SBoxE6(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(14,B0,B1,B2,B3); SBoxE7(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(15,B0,B1,B2,B3); SBoxE8(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(16,B0,B1,B2,B3); SBoxE1(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(17,B0,B1,B2,B3); SBoxE2(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(18,B0,B1,B2,B3); SBoxE3(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(19,B0,B1,B2,B3); SBoxE4(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(20,B0,B1,B2,B3); SBoxE5(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(21,B0,B1,B2,B3); SBoxE6(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(22,B0,B1,B2,B3); SBoxE7(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(23,B0,B1,B2,B3); SBoxE8(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(24,B0,B1,B2,B3); SBoxE1(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(25,B0,B1,B2,B3); SBoxE2(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(26,B0,B1,B2,B3); SBoxE3(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(27,B0,B1,B2,B3); SBoxE4(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(28,B0,B1,B2,B3); SBoxE5(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(29,B0,B1,B2,B3); SBoxE6(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(30,B0,B1,B2,B3); SBoxE7(unsigned __int32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(31,B0,B1,B2,B3); SBoxE8(unsigned __int32,B0,B1,B2,B3); key_xor(32,B0,B1,B2,B3);
+
+ B0 = BOTAN_ENDIAN_L2N(B0);
+ B1 = BOTAN_ENDIAN_L2N(B1);
+ B2 = BOTAN_ENDIAN_L2N(B2);
+ B3 = BOTAN_ENDIAN_L2N(B3);
+ memcpy(out + 0, &B0, 4);
+ memcpy(out + 4, &B1, 4);
+ memcpy(out + 8, &B2, 4);
+ memcpy(out + 12, &B3, 4);
+
+ in += 16;
+ out += 16;
+ }
+}
+
+/*
+* Serpent Decryption
+*/
+void serpent_decrypt_blocks(const unsigned __int8* in, unsigned __int8* out, size_t blocks, unsigned __int8 *ks)
+{
+ unsigned __int32 B0, B1, B2, B3;
+ unsigned __int32* round_key = ((unsigned __int32*) ks) + 8;
+ size_t i;
+#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
+ if(HasSSE2() && (blocks >= 4))
+ {
+ CRYPTOPP_ALIGN_DATA(16) unsigned __int8 alignedInputBuffer[4 * 16];
+ CRYPTOPP_ALIGN_DATA(16) unsigned __int8 alignedOutputBuffer[4 * 16];
+ unsigned __int8 *inPtr, *outPtr;
+ BOOL inputAligned = IsAligned16(in)? TRUE : FALSE;
+ BOOL outputAligned = IsAligned16(out)? TRUE : FALSE;
+
+ while(blocks >= 4)
+ {
+ if (inputAligned)
+ inPtr = (unsigned __int8 *) in;
+ else
+ {
+ inPtr = alignedInputBuffer;
+ memcpy (inPtr, in, 4 * 16);
+ }
+ if (out == in)
+ outPtr = inPtr;
+ else if (outputAligned)
+ outPtr = (unsigned __int8 *) out;
+ else
+ {
+ outPtr = alignedOutputBuffer;
+ memcpy (outPtr, out, 4 * 16);
+ }
+ serpent_simd_decrypt_blocks_4(inPtr, outPtr, round_key);
+ in += 4 * 16;
+ out += 4 * 16;
+ blocks -= 4;
+ }
+ }
+#endif
+
+ for(i = 0; i != blocks; ++i)
+ {
+ memcpy (&B0, in + 0, 4);
+ memcpy (&B1, in + 4, 4);
+ memcpy (&B2, in + 8, 4);
+ memcpy (&B3, in + 12, 4);
+ B0 = BOTAN_ENDIAN_N2L (B0);
+ B1 = BOTAN_ENDIAN_N2L (B1);
+ B2 = BOTAN_ENDIAN_N2L (B2);
+ B3 = BOTAN_ENDIAN_N2L (B3);
+
+ key_xor(32,B0,B1,B2,B3); SBoxD8(unsigned __int32,B0,B1,B2,B3); key_xor(31,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD7(unsigned __int32,B0,B1,B2,B3); key_xor(30,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD6(unsigned __int32,B0,B1,B2,B3); key_xor(29,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD5(unsigned __int32,B0,B1,B2,B3); key_xor(28,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD4(unsigned __int32,B0,B1,B2,B3); key_xor(27,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD3(unsigned __int32,B0,B1,B2,B3); key_xor(26,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD2(unsigned __int32,B0,B1,B2,B3); key_xor(25,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD1(unsigned __int32,B0,B1,B2,B3); key_xor(24,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD8(unsigned __int32,B0,B1,B2,B3); key_xor(23,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD7(unsigned __int32,B0,B1,B2,B3); key_xor(22,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD6(unsigned __int32,B0,B1,B2,B3); key_xor(21,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD5(unsigned __int32,B0,B1,B2,B3); key_xor(20,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD4(unsigned __int32,B0,B1,B2,B3); key_xor(19,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD3(unsigned __int32,B0,B1,B2,B3); key_xor(18,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD2(unsigned __int32,B0,B1,B2,B3); key_xor(17,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD1(unsigned __int32,B0,B1,B2,B3); key_xor(16,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD8(unsigned __int32,B0,B1,B2,B3); key_xor(15,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD7(unsigned __int32,B0,B1,B2,B3); key_xor(14,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD6(unsigned __int32,B0,B1,B2,B3); key_xor(13,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD5(unsigned __int32,B0,B1,B2,B3); key_xor(12,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD4(unsigned __int32,B0,B1,B2,B3); key_xor(11,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD3(unsigned __int32,B0,B1,B2,B3); key_xor(10,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD2(unsigned __int32,B0,B1,B2,B3); key_xor( 9,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD1(unsigned __int32,B0,B1,B2,B3); key_xor( 8,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD8(unsigned __int32,B0,B1,B2,B3); key_xor( 7,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD7(unsigned __int32,B0,B1,B2,B3); key_xor( 6,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD6(unsigned __int32,B0,B1,B2,B3); key_xor( 5,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD5(unsigned __int32,B0,B1,B2,B3); key_xor( 4,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD4(unsigned __int32,B0,B1,B2,B3); key_xor( 3,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD3(unsigned __int32,B0,B1,B2,B3); key_xor( 2,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD2(unsigned __int32,B0,B1,B2,B3); key_xor( 1,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD1(unsigned __int32,B0,B1,B2,B3); key_xor( 0,B0,B1,B2,B3);
+
+ B0 = BOTAN_ENDIAN_L2N(B0);
+ B1 = BOTAN_ENDIAN_L2N(B1);
+ B2 = BOTAN_ENDIAN_L2N(B2);
+ B3 = BOTAN_ENDIAN_L2N(B3);
+ memcpy(out + 0, &B0, 4);
+ memcpy(out + 4, &B1, 4);
+ memcpy(out + 8, &B2, 4);
+ memcpy(out + 12, &B3, 4);
+
+ in += 16;
+ out += 16;
+ }
+}
+
+#undef key_xor
+#undef transform
+#undef i_transform
+
+/*
+* Serpent Key Schedule
+*/
+void serpent_set_key(const unsigned __int8 userKey[], unsigned __int8 *ks)
+{
+ const unsigned __int32 PHI = 0x9E3779B9;
+ unsigned __int32* W = (unsigned __int32*) ks;
+ int i;
+ for(i = 0; i != 8; ++i)
+ {
+ memcpy (W + i, userKey + (i*4), 4);
+ W[i] = BOTAN_ENDIAN_N2L(W[i]);
+ }
+
+ for(i = 8; i != 140; ++i)
+ {
+ unsigned __int32 wi = W[i-8] ^ W[i-5] ^ W[i-3] ^ W[i-1] ^ PHI ^ (unsigned __int32)(i-8);
+ W[i] = rotl32(wi, 11);
+ }
+
+ SBoxE4(unsigned __int32,W[ 8],W[ 9],W[ 10],W[ 11]); SBoxE3(unsigned __int32,W[ 12],W[ 13],W[ 14],W[ 15]);
+ SBoxE2(unsigned __int32,W[ 16],W[ 17],W[ 18],W[ 19]); SBoxE1(unsigned __int32,W[ 20],W[ 21],W[ 22],W[ 23]);
+ SBoxE8(unsigned __int32,W[ 24],W[ 25],W[ 26],W[ 27]); SBoxE7(unsigned __int32,W[ 28],W[ 29],W[ 30],W[ 31]);
+ SBoxE6(unsigned __int32,W[ 32],W[ 33],W[ 34],W[ 35]); SBoxE5(unsigned __int32,W[ 36],W[ 37],W[ 38],W[ 39]);
+ SBoxE4(unsigned __int32,W[ 40],W[ 41],W[ 42],W[ 43]); SBoxE3(unsigned __int32,W[ 44],W[ 45],W[ 46],W[ 47]);
+ SBoxE2(unsigned __int32,W[ 48],W[ 49],W[ 50],W[ 51]); SBoxE1(unsigned __int32,W[ 52],W[ 53],W[ 54],W[ 55]);
+ SBoxE8(unsigned __int32,W[ 56],W[ 57],W[ 58],W[ 59]); SBoxE7(unsigned __int32,W[ 60],W[ 61],W[ 62],W[ 63]);
+ SBoxE6(unsigned __int32,W[ 64],W[ 65],W[ 66],W[ 67]); SBoxE5(unsigned __int32,W[ 68],W[ 69],W[ 70],W[ 71]);
+ SBoxE4(unsigned __int32,W[ 72],W[ 73],W[ 74],W[ 75]); SBoxE3(unsigned __int32,W[ 76],W[ 77],W[ 78],W[ 79]);
+ SBoxE2(unsigned __int32,W[ 80],W[ 81],W[ 82],W[ 83]); SBoxE1(unsigned __int32,W[ 84],W[ 85],W[ 86],W[ 87]);
+ SBoxE8(unsigned __int32,W[ 88],W[ 89],W[ 90],W[ 91]); SBoxE7(unsigned __int32,W[ 92],W[ 93],W[ 94],W[ 95]);
+ SBoxE6(unsigned __int32,W[ 96],W[ 97],W[ 98],W[ 99]); SBoxE5(unsigned __int32,W[100],W[101],W[102],W[103]);
+ SBoxE4(unsigned __int32,W[104],W[105],W[106],W[107]); SBoxE3(unsigned __int32,W[108],W[109],W[110],W[111]);
+ SBoxE2(unsigned __int32,W[112],W[113],W[114],W[115]); SBoxE1(unsigned __int32,W[116],W[117],W[118],W[119]);
+ SBoxE8(unsigned __int32,W[120],W[121],W[122],W[123]); SBoxE7(unsigned __int32,W[124],W[125],W[126],W[127]);
+ SBoxE6(unsigned __int32,W[128],W[129],W[130],W[131]); SBoxE5(unsigned __int32,W[132],W[133],W[134],W[135]);
+ SBoxE4(unsigned __int32,W[136],W[137],W[138],W[139]);
+}
diff --git a/src/Crypto/SerpentFast.h b/src/Crypto/SerpentFast.h
new file mode 100644
index 00000000..a5937b6e
--- /dev/null
+++ b/src/Crypto/SerpentFast.h
@@ -0,0 +1,28 @@
+/*
+* Serpent
+* (C) 1999-2007 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include "Common/Tcdefs.h"
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* userKey is always 32-bytes long */
+void serpent_set_key(const unsigned __int8 userKey[], unsigned __int8 *ks);
+void serpent_encrypt_blocks(const unsigned __int8* in, unsigned __int8* out, size_t blocks, unsigned __int8 *ks);
+void serpent_decrypt_blocks(const unsigned __int8* in, unsigned __int8* out, size_t blocks, unsigned __int8 *ks);
+
+#define serpent_encrypt(inBlock,outBlock,ks) serpent_encrypt_blocks(inBlock,outBlock,1,ks)
+#define serpent_decrypt(inBlock,outBlock,ks) serpent_decrypt_blocks(inBlock,outBlock,1,ks)
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/src/Crypto/SerpentFast_sbox.h b/src/Crypto/SerpentFast_sbox.h
new file mode 100644
index 00000000..b5c4b21f
--- /dev/null
+++ b/src/Crypto/SerpentFast_sbox.h
@@ -0,0 +1,437 @@
+/*
+* Serpent SBox Expressions
+* (C) 1999-2007,2013 Jack Lloyd
+*
+* The sbox expressions used here were discovered by Dag Arne Osvik and
+* are described in his paper "Speeding Up Serpent".
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#ifndef BOTAN_SERPENT_SBOX_H__
+#define BOTAN_SERPENT_SBOX_H__
+
+#define SBoxE1(T, B0, B1, B2, B3) \
+ do { \
+ T B4; \
+ B3 ^= B0; \
+ B4 = B1; \
+ B1 &= B3; \
+ B4 ^= B2; \
+ B1 ^= B0; \
+ B0 |= B3; \
+ B0 ^= B4; \
+ B4 ^= B3; \
+ B3 ^= B2; \
+ B2 |= B1; \
+ B2 ^= B4; \
+ B4 = ~B4; \
+ B4 |= B1; \
+ B1 ^= B3; \
+ B1 ^= B4; \
+ B3 |= B0; \
+ B1 ^= B3; \
+ B4 ^= B3; \
+ B3 = B0; \
+ B0 = B1; \
+ B1 = B4; \
+ } while(0);
+
+#define SBoxE2(T, B0, B1, B2, B3) \
+ do { \
+ T B4; \
+ B0 = ~B0; \
+ B2 = ~B2; \
+ B4 = B0; \
+ B0 &= B1; \
+ B2 ^= B0; \
+ B0 |= B3; \
+ B3 ^= B2; \
+ B1 ^= B0; \
+ B0 ^= B4; \
+ B4 |= B1; \
+ B1 ^= B3; \
+ B2 |= B0; \
+ B2 &= B4; \
+ B0 ^= B1; \
+ B1 &= B2; \
+ B1 ^= B0; \
+ B0 &= B2; \
+ B4 ^= B0; \
+ B0 = B2; \
+ B2 = B3; \
+ B3 = B1; \
+ B1 = B4; \
+ } while(0);
+
+#define SBoxE3(T, B0, B1, B2, B3) \
+ do { \
+ T B4 = B0; \
+ B0 &= B2; \
+ B0 ^= B3; \
+ B2 ^= B1; \
+ B2 ^= B0; \
+ B3 |= B4; \
+ B3 ^= B1; \
+ B4 ^= B2; \
+ B1 = B3; \
+ B3 |= B4; \
+ B3 ^= B0; \
+ B0 &= B1; \
+ B4 ^= B0; \
+ B1 ^= B3; \
+ B1 ^= B4; \
+ B0 = B2; \
+ B2 = B1; \
+ B1 = B3; \
+ B3 = ~B4; \
+ } while(0);
+
+#define SBoxE4(T, B0, B1, B2, B3) \
+ do { \
+ T B4 = B0; \
+ B0 |= B3; \
+ B3 ^= B1; \
+ B1 &= B4; \
+ B4 ^= B2; \
+ B2 ^= B3; \
+ B3 &= B0; \
+ B4 |= B1; \
+ B3 ^= B4; \
+ B0 ^= B1; \
+ B4 &= B0; \
+ B1 ^= B3; \
+ B4 ^= B2; \
+ B1 |= B0; \
+ B1 ^= B2; \
+ B0 ^= B3; \
+ B2 = B1; \
+ B1 |= B3; \
+ B0 ^= B1; \
+ B1 = B2; \
+ B2 = B3; \
+ B3 = B4; \
+ } while(0);
+
+#define SBoxE5(T, B0, B1, B2, B3) \
+ do { \
+ T B4; \
+ B1 ^= B3; \
+ B3 = ~B3; \
+ B2 ^= B3; \
+ B3 ^= B0; \
+ B4 = B1; \
+ B1 &= B3; \
+ B1 ^= B2; \
+ B4 ^= B3; \
+ B0 ^= B4; \
+ B2 &= B4; \
+ B2 ^= B0; \
+ B0 &= B1; \
+ B3 ^= B0; \
+ B4 |= B1; \
+ B4 ^= B0; \
+ B0 |= B3; \
+ B0 ^= B2; \
+ B2 &= B3; \
+ B0 = ~B0; \
+ B4 ^= B2; \
+ B2 = B0; \
+ B0 = B1; \
+ B1 = B4; \
+ } while(0);
+
+#define SBoxE6(T, B0, B1, B2, B3) \
+ do { \
+ T B4; \
+ B0 ^= B1; \
+ B1 ^= B3; \
+ B3 = ~B3; \
+ B4 = B1; \
+ B1 &= B0; \
+ B2 ^= B3; \
+ B1 ^= B2; \
+ B2 |= B4; \
+ B4 ^= B3; \
+ B3 &= B1; \
+ B3 ^= B0; \
+ B4 ^= B1; \
+ B4 ^= B2; \
+ B2 ^= B0; \
+ B0 &= B3; \
+ B2 = ~B2; \
+ B0 ^= B4; \
+ B4 |= B3; \
+ B4 ^= B2; \
+ B2 = B0; \
+ B0 = B1; \
+ B1 = B3; \
+ B3 = B4; \
+ } while(0);
+
+#define SBoxE7(T, B0, B1, B2, B3) \
+ do { \
+ T B4; \
+ B2 = ~B2; \
+ B4 = B3; \
+ B3 &= B0; \
+ B0 ^= B4; \
+ B3 ^= B2; \
+ B2 |= B4; \
+ B1 ^= B3; \
+ B2 ^= B0; \
+ B0 |= B1; \
+ B2 ^= B1; \
+ B4 ^= B0; \
+ B0 |= B3; \
+ B0 ^= B2; \
+ B4 ^= B3; \
+ B4 ^= B0; \
+ B3 = ~B3; \
+ B2 &= B4; \
+ B3 ^= B2; \
+ B2 = B4; \
+ } while(0);
+
+#define SBoxE8(T, B0, B1, B2, B3) \
+ do { \
+ T B4 = B1; \
+ B1 |= B2; \
+ B1 ^= B3; \
+ B4 ^= B2; \
+ B2 ^= B1; \
+ B3 |= B4; \
+ B3 &= B0; \
+ B4 ^= B2; \
+ B3 ^= B1; \
+ B1 |= B4; \
+ B1 ^= B0; \
+ B0 |= B4; \
+ B0 ^= B2; \
+ B1 ^= B4; \
+ B2 ^= B1; \
+ B1 &= B0; \
+ B1 ^= B4; \
+ B2 = ~B2; \
+ B2 |= B0; \
+ B4 ^= B2; \
+ B2 = B1; \
+ B1 = B3; \
+ B3 = B0; \
+ B0 = B4; \
+ } while(0);
+
+#define SBoxD1(T, B0, B1, B2, B3) \
+ do { \
+ T B4; \
+ B2 = ~B2; \
+ B4 = B1; \
+ B1 |= B0; \
+ B4 = ~B4; \
+ B1 ^= B2; \
+ B2 |= B4; \
+ B1 ^= B3; \
+ B0 ^= B4; \
+ B2 ^= B0; \
+ B0 &= B3; \
+ B4 ^= B0; \
+ B0 |= B1; \
+ B0 ^= B2; \
+ B3 ^= B4; \
+ B2 ^= B1; \
+ B3 ^= B0; \
+ B3 ^= B1; \
+ B2 &= B3; \
+ B4 ^= B2; \
+ B2 = B1; \
+ B1 = B4; \
+ } while(0);
+
+#define SBoxD2(T, B0, B1, B2, B3) \
+ do { \
+ T B4 = B1; \
+ B1 ^= B3; \
+ B3 &= B1; \
+ B4 ^= B2; \
+ B3 ^= B0; \
+ B0 |= B1; \
+ B2 ^= B3; \
+ B0 ^= B4; \
+ B0 |= B2; \
+ B1 ^= B3; \
+ B0 ^= B1; \
+ B1 |= B3; \
+ B1 ^= B0; \
+ B4 = ~B4; \
+ B4 ^= B1; \
+ B1 |= B0; \
+ B1 ^= B0; \
+ B1 |= B4; \
+ B3 ^= B1; \
+ B1 = B0; \
+ B0 = B4; \
+ B4 = B2; \
+ B2 = B3; \
+ B3 = B4; \
+ } while(0);
+
+#define SBoxD3(T, B0, B1, B2, B3) \
+ do { \
+ T B4; \
+ B2 ^= B3; \
+ B3 ^= B0; \
+ B4 = B3; \
+ B3 &= B2; \
+ B3 ^= B1; \
+ B1 |= B2; \
+ B1 ^= B4; \
+ B4 &= B3; \
+ B2 ^= B3; \
+ B4 &= B0; \
+ B4 ^= B2; \
+ B2 &= B1; \
+ B2 |= B0; \
+ B3 = ~B3; \
+ B2 ^= B3; \
+ B0 ^= B3; \
+ B0 &= B1; \
+ B3 ^= B4; \
+ B3 ^= B0; \
+ B0 = B1; \
+ B1 = B4; \
+ } while(0);
+
+#define SBoxD4(T, B0, B1, B2, B3) \
+ do { \
+ T B4 = B2; \
+ B2 ^= B1; \
+ B0 ^= B2; \
+ B4 &= B2; \
+ B4 ^= B0; \
+ B0 &= B1; \
+ B1 ^= B3; \
+ B3 |= B4; \
+ B2 ^= B3; \
+ B0 ^= B3; \
+ B1 ^= B4; \
+ B3 &= B2; \
+ B3 ^= B1; \
+ B1 ^= B0; \
+ B1 |= B2; \
+ B0 ^= B3; \
+ B1 ^= B4; \
+ B0 ^= B1; \
+ B4 = B0; \
+ B0 = B2; \
+ B2 = B3; \
+ B3 = B4; \
+ } while(0);
+
+#define SBoxD5(T, B0, B1, B2, B3) \
+ do { \
+ T B4 = B2; \
+ B2 &= B3; \
+ B2 ^= B1; \
+ B1 |= B3; \
+ B1 &= B0; \
+ B4 ^= B2; \
+ B4 ^= B1; \
+ B1 &= B2; \
+ B0 = ~B0; \
+ B3 ^= B4; \
+ B1 ^= B3; \
+ B3 &= B0; \
+ B3 ^= B2; \
+ B0 ^= B1; \
+ B2 &= B0; \
+ B3 ^= B0; \
+ B2 ^= B4; \
+ B2 |= B3; \
+ B3 ^= B0; \
+ B2 ^= B1; \
+ B1 = B3; \
+ B3 = B4; \
+ } while(0);
+
+#define SBoxD6(T, B0, B1, B2, B3) \
+ do { \
+ T B4; \
+ B1 = ~B1; \
+ B4 = B3; \
+ B2 ^= B1; \
+ B3 |= B0; \
+ B3 ^= B2; \
+ B2 |= B1; \
+ B2 &= B0; \
+ B4 ^= B3; \
+ B2 ^= B4; \
+ B4 |= B0; \
+ B4 ^= B1; \
+ B1 &= B2; \
+ B1 ^= B3; \
+ B4 ^= B2; \
+ B3 &= B4; \
+ B4 ^= B1; \
+ B3 ^= B4; \
+ B4 = ~B4; \
+ B3 ^= B0; \
+ B0 = B1; \
+ B1 = B4; \
+ B4 = B3; \
+ B3 = B2; \
+ B2 = B4; \
+ } while(0);
+
+#define SBoxD7(T, B0, B1, B2, B3) \
+ do { \
+ T B4; \
+ B0 ^= B2; \
+ B4 = B2; \
+ B2 &= B0; \
+ B4 ^= B3; \
+ B2 = ~B2; \
+ B3 ^= B1; \
+ B2 ^= B3; \
+ B4 |= B0; \
+ B0 ^= B2; \
+ B3 ^= B4; \
+ B4 ^= B1; \
+ B1 &= B3; \
+ B1 ^= B0; \
+ B0 ^= B3; \
+ B0 |= B2; \
+ B3 ^= B1; \
+ B4 ^= B0; \
+ B0 = B1; \
+ B1 = B2; \
+ B2 = B4; \
+ } while(0);
+
+#define SBoxD8(T, B0, B1, B2, B3) \
+ do { \
+ T B4 = B2; \
+ B2 ^= B0; \
+ B0 &= B3; \
+ B4 |= B3; \
+ B2 = ~B2; \
+ B3 ^= B1; \
+ B1 |= B0; \
+ B0 ^= B2; \
+ B2 &= B4; \
+ B3 &= B4; \
+ B1 ^= B2; \
+ B2 ^= B0; \
+ B0 |= B2; \
+ B4 ^= B1; \
+ B0 ^= B3; \
+ B3 ^= B4; \
+ B4 |= B0; \
+ B3 ^= B2; \
+ B4 ^= B2; \
+ B2 = B1; \
+ B1 = B0; \
+ B0 = B3; \
+ B3 = B4; \
+ } while(0);
+
+#endif
diff --git a/src/Crypto/SerpentFast_simd.cpp b/src/Crypto/SerpentFast_simd.cpp
new file mode 100644
index 00000000..d5d5d65b
--- /dev/null
+++ b/src/Crypto/SerpentFast_simd.cpp
@@ -0,0 +1,338 @@
+/*
+* Serpent (SIMD)
+* (C) 2009,2013 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#include "SerpentFast.h"
+#include "SerpentFast_sbox.h"
+#if !defined(_UEFI)
+#include <memory.h>
+#include <stdlib.h>
+#endif
+#include "cpu.h"
+#include "misc.h"
+
+#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
+
+/**
+* This class is not a general purpose SIMD type, and only offers
+* instructions needed for evaluation of specific crypto primitives.
+* For example it does not currently have equality operators of any
+* kind.
+*/
+class SIMD_4x32
+{
+public:
+
+ SIMD_4x32() // zero initialized
+ {
+ ::memset(&m_reg, 0, sizeof(m_reg));
+ }
+
+ explicit SIMD_4x32(const unsigned __int32 B[4])
+ {
+ m_reg = _mm_loadu_si128(reinterpret_cast<const __m128i*>(B));
+ }
+
+ SIMD_4x32(unsigned __int32 B0, unsigned __int32 B1, unsigned __int32 B2, unsigned __int32 B3)
+ {
+ m_reg = _mm_set_epi32(B0, B1, B2, B3);
+ }
+
+ explicit SIMD_4x32(unsigned __int32 B)
+ {
+ m_reg = _mm_set1_epi32(B);
+ }
+
+ static SIMD_4x32 load_le(const void* in)
+ {
+ return SIMD_4x32(_mm_loadu_si128(reinterpret_cast<const __m128i*>(in)));
+ }
+
+ static SIMD_4x32 load_be(const void* in)
+ {
+ return load_le(in).bswap();
+ }
+
+ void store_le(unsigned __int8 out[]) const
+ {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(out), m_reg);
+ }
+
+ void store_be(unsigned __int8 out[]) const
+ {
+ bswap().store_le(out);
+ }
+
+ void rotate_left(size_t rot)
+ {
+ m_reg = _mm_or_si128(_mm_slli_epi32(m_reg, static_cast<int>(rot)),
+ _mm_srli_epi32(m_reg, static_cast<int>(32-rot)));
+
+ }
+
+ void rotate_right(size_t rot)
+ {
+ rotate_left(32 - rot);
+ }
+
+ void operator+=(const SIMD_4x32& other)
+ {
+ m_reg = _mm_add_epi32(m_reg, other.m_reg);
+ }
+
+ SIMD_4x32 operator+(const SIMD_4x32& other) const
+ {
+ return SIMD_4x32(_mm_add_epi32(m_reg, other.m_reg));
+ }
+
+ void operator-=(const SIMD_4x32& other)
+ {
+ m_reg = _mm_sub_epi32(m_reg, other.m_reg);
+ }
+
+ SIMD_4x32 operator-(const SIMD_4x32& other) const
+ {
+ return SIMD_4x32(_mm_sub_epi32(m_reg, other.m_reg));
+ }
+
+ void operator^=(const SIMD_4x32& other)
+ {
+ m_reg = _mm_xor_si128(m_reg, other.m_reg);
+ }
+
+ SIMD_4x32 operator^(const SIMD_4x32& other) const
+ {
+ return SIMD_4x32(_mm_xor_si128(m_reg, other.m_reg));
+ }
+
+ void operator|=(const SIMD_4x32& other)
+ {
+ m_reg = _mm_or_si128(m_reg, other.m_reg);
+ }
+
+ SIMD_4x32 operator&(const SIMD_4x32& other)
+ {
+ return SIMD_4x32(_mm_and_si128(m_reg, other.m_reg));
+ }
+
+ void operator&=(const SIMD_4x32& other)
+ {
+ m_reg = _mm_and_si128(m_reg, other.m_reg);
+ }
+
+ SIMD_4x32 operator<<(size_t shift) const
+ {
+ return SIMD_4x32(_mm_slli_epi32(m_reg, static_cast<int>(shift)));
+ }
+
+ SIMD_4x32 operator>>(size_t shift) const
+ {
+ return SIMD_4x32(_mm_srli_epi32(m_reg, static_cast<int>(shift)));
+ }
+
+ SIMD_4x32 operator~() const
+ {
+ return SIMD_4x32(_mm_xor_si128(m_reg, _mm_set1_epi32(0xFFFFFFFF)));
+ }
+
+ // (~reg) & other
+ SIMD_4x32 andc(const SIMD_4x32& other)
+ {
+ return SIMD_4x32(_mm_andnot_si128(m_reg, other.m_reg));
+ }
+
+ SIMD_4x32 bswap() const
+ {
+ __m128i T = m_reg;
+
+ T = _mm_shufflehi_epi16(T, _MM_SHUFFLE(2, 3, 0, 1));
+ T = _mm_shufflelo_epi16(T, _MM_SHUFFLE(2, 3, 0, 1));
+
+ return SIMD_4x32(_mm_or_si128(_mm_srli_epi16(T, 8),
+ _mm_slli_epi16(T, 8)));
+ }
+
+ static void transpose(SIMD_4x32& B0, SIMD_4x32& B1,
+ SIMD_4x32& B2, SIMD_4x32& B3)
+ {
+ __m128i T0 = _mm_unpacklo_epi32(B0.m_reg, B1.m_reg);
+ __m128i T1 = _mm_unpacklo_epi32(B2.m_reg, B3.m_reg);
+ __m128i T2 = _mm_unpackhi_epi32(B0.m_reg, B1.m_reg);
+ __m128i T3 = _mm_unpackhi_epi32(B2.m_reg, B3.m_reg);
+ B0.m_reg = _mm_unpacklo_epi64(T0, T1);
+ B1.m_reg = _mm_unpackhi_epi64(T0, T1);
+ B2.m_reg = _mm_unpacklo_epi64(T2, T3);
+ B3.m_reg = _mm_unpackhi_epi64(T2, T3);
+ }
+
+private:
+
+ explicit SIMD_4x32(__m128i in) { m_reg = in; }
+
+ __m128i m_reg;
+
+};
+
+typedef SIMD_4x32 SIMD_32;
+
+#define key_xor(round, B0, B1, B2, B3) \
+ do { \
+ B0 ^= SIMD_32(round_key[4*round ]); \
+ B1 ^= SIMD_32(round_key[4*round+1]); \
+ B2 ^= SIMD_32(round_key[4*round+2]); \
+ B3 ^= SIMD_32(round_key[4*round+3]); \
+ } while(0);
+
+/*
+* Serpent's linear transformations
+*/
+#define transform(B0, B1, B2, B3) \
+ do { \
+ B0.rotate_left(13); \
+ B2.rotate_left(3); \
+ B1 ^= B0 ^ B2; \
+ B3 ^= B2 ^ (B0 << 3); \
+ B1.rotate_left(1); \
+ B3.rotate_left(7); \
+ B0 ^= B1 ^ B3; \
+ B2 ^= B3 ^ (B1 << 7); \
+ B0.rotate_left(5); \
+ B2.rotate_left(22); \
+ } while(0);
+
+#define i_transform(B0, B1, B2, B3) \
+ do { \
+ B2.rotate_right(22); \
+ B0.rotate_right(5); \
+ B2 ^= B3 ^ (B1 << 7); \
+ B0 ^= B1 ^ B3; \
+ B3.rotate_right(7); \
+ B1.rotate_right(1); \
+ B3 ^= B2 ^ (B0 << 3); \
+ B1 ^= B0 ^ B2; \
+ B2.rotate_right(3); \
+ B0.rotate_right(13); \
+ } while(0);
+
+
+
+/*
+* SIMD Serpent Encryption of 4 blocks in parallel
+*/
+extern "C" void serpent_simd_encrypt_blocks_4(const unsigned __int8 in[], unsigned __int8 out[], unsigned __int32* round_key)
+{
+ SIMD_32 B0 = SIMD_32::load_le(in);
+ SIMD_32 B1 = SIMD_32::load_le(in + 16);
+ SIMD_32 B2 = SIMD_32::load_le(in + 32);
+ SIMD_32 B3 = SIMD_32::load_le(in + 48);
+
+ SIMD_32::transpose(B0, B1, B2, B3);
+
+ key_xor( 0,B0,B1,B2,B3); SBoxE1(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 1,B0,B1,B2,B3); SBoxE2(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 2,B0,B1,B2,B3); SBoxE3(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 3,B0,B1,B2,B3); SBoxE4(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 4,B0,B1,B2,B3); SBoxE5(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 5,B0,B1,B2,B3); SBoxE6(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 6,B0,B1,B2,B3); SBoxE7(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 7,B0,B1,B2,B3); SBoxE8(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+
+ key_xor( 8,B0,B1,B2,B3); SBoxE1(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor( 9,B0,B1,B2,B3); SBoxE2(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(10,B0,B1,B2,B3); SBoxE3(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(11,B0,B1,B2,B3); SBoxE4(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(12,B0,B1,B2,B3); SBoxE5(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(13,B0,B1,B2,B3); SBoxE6(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(14,B0,B1,B2,B3); SBoxE7(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(15,B0,B1,B2,B3); SBoxE8(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+
+ key_xor(16,B0,B1,B2,B3); SBoxE1(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(17,B0,B1,B2,B3); SBoxE2(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(18,B0,B1,B2,B3); SBoxE3(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(19,B0,B1,B2,B3); SBoxE4(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(20,B0,B1,B2,B3); SBoxE5(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(21,B0,B1,B2,B3); SBoxE6(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(22,B0,B1,B2,B3); SBoxE7(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(23,B0,B1,B2,B3); SBoxE8(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+
+ key_xor(24,B0,B1,B2,B3); SBoxE1(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(25,B0,B1,B2,B3); SBoxE2(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(26,B0,B1,B2,B3); SBoxE3(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(27,B0,B1,B2,B3); SBoxE4(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(28,B0,B1,B2,B3); SBoxE5(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(29,B0,B1,B2,B3); SBoxE6(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(30,B0,B1,B2,B3); SBoxE7(SIMD_32,B0,B1,B2,B3); transform(B0,B1,B2,B3);
+ key_xor(31,B0,B1,B2,B3); SBoxE8(SIMD_32,B0,B1,B2,B3); key_xor(32,B0,B1,B2,B3);
+
+ SIMD_32::transpose(B0, B1, B2, B3);
+
+ B0.store_le(out);
+ B1.store_le(out + 16);
+ B2.store_le(out + 32);
+ B3.store_le(out + 48);
+}
+
+/*
+* SIMD Serpent Decryption of 4 blocks in parallel
+*/
+extern "C" void serpent_simd_decrypt_blocks_4(const unsigned __int8 in[], unsigned __int8 out[], unsigned __int32* round_key)
+{
+ SIMD_32 B0 = SIMD_32::load_le(in);
+ SIMD_32 B1 = SIMD_32::load_le(in + 16);
+ SIMD_32 B2 = SIMD_32::load_le(in + 32);
+ SIMD_32 B3 = SIMD_32::load_le(in + 48);
+
+ SIMD_32::transpose(B0, B1, B2, B3);
+
+ key_xor(32,B0,B1,B2,B3); SBoxD8(SIMD_32,B0,B1,B2,B3); key_xor(31,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD7(SIMD_32,B0,B1,B2,B3); key_xor(30,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD6(SIMD_32,B0,B1,B2,B3); key_xor(29,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD5(SIMD_32,B0,B1,B2,B3); key_xor(28,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD4(SIMD_32,B0,B1,B2,B3); key_xor(27,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD3(SIMD_32,B0,B1,B2,B3); key_xor(26,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD2(SIMD_32,B0,B1,B2,B3); key_xor(25,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD1(SIMD_32,B0,B1,B2,B3); key_xor(24,B0,B1,B2,B3);
+
+ i_transform(B0,B1,B2,B3); SBoxD8(SIMD_32,B0,B1,B2,B3); key_xor(23,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD7(SIMD_32,B0,B1,B2,B3); key_xor(22,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD6(SIMD_32,B0,B1,B2,B3); key_xor(21,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD5(SIMD_32,B0,B1,B2,B3); key_xor(20,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD4(SIMD_32,B0,B1,B2,B3); key_xor(19,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD3(SIMD_32,B0,B1,B2,B3); key_xor(18,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD2(SIMD_32,B0,B1,B2,B3); key_xor(17,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD1(SIMD_32,B0,B1,B2,B3); key_xor(16,B0,B1,B2,B3);
+
+ i_transform(B0,B1,B2,B3); SBoxD8(SIMD_32,B0,B1,B2,B3); key_xor(15,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD7(SIMD_32,B0,B1,B2,B3); key_xor(14,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD6(SIMD_32,B0,B1,B2,B3); key_xor(13,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD5(SIMD_32,B0,B1,B2,B3); key_xor(12,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD4(SIMD_32,B0,B1,B2,B3); key_xor(11,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD3(SIMD_32,B0,B1,B2,B3); key_xor(10,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD2(SIMD_32,B0,B1,B2,B3); key_xor( 9,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD1(SIMD_32,B0,B1,B2,B3); key_xor( 8,B0,B1,B2,B3);
+
+ i_transform(B0,B1,B2,B3); SBoxD8(SIMD_32,B0,B1,B2,B3); key_xor( 7,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD7(SIMD_32,B0,B1,B2,B3); key_xor( 6,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD6(SIMD_32,B0,B1,B2,B3); key_xor( 5,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD5(SIMD_32,B0,B1,B2,B3); key_xor( 4,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD4(SIMD_32,B0,B1,B2,B3); key_xor( 3,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD3(SIMD_32,B0,B1,B2,B3); key_xor( 2,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD2(SIMD_32,B0,B1,B2,B3); key_xor( 1,B0,B1,B2,B3);
+ i_transform(B0,B1,B2,B3); SBoxD1(SIMD_32,B0,B1,B2,B3); key_xor( 0,B0,B1,B2,B3);
+
+ SIMD_32::transpose(B0, B1, B2, B3);
+
+ B0.store_le(out);
+ B1.store_le(out + 16);
+ B2.store_le(out + 32);
+ B3.store_le(out + 48);
+}
+
+#undef key_xor
+#undef transform
+#undef i_transform
+
+#endif