From fc37cc4a02ed13d1a73b941a9f80975600fd1b99 Mon Sep 17 00:00:00 2001 From: David Foerster Date: Tue, 10 May 2016 20:20:14 +0200 Subject: Normalize all line terminators --- src/Common/GfMul.c | 1536 ++++++++++++++++++++++++++-------------------------- 1 file changed, 768 insertions(+), 768 deletions(-) (limited to 'src/Common/GfMul.c') diff --git a/src/Common/GfMul.c b/src/Common/GfMul.c index e933e56b..9cd74a89 100644 --- a/src/Common/GfMul.c +++ b/src/Common/GfMul.c @@ -1,768 +1,768 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software is allowed (with or without - changes) provided that: - - 1. source code distributions include the above copyright notice, this - list of conditions and the following disclaimer; - - 2. binary distributions include the above copyright notice, this list - of conditions and the following disclaimer in their documentation; - - 3. the name of the copyright holder is not used to endorse products - built using this software without specific written permission. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue Date: 31/01/2004 - - My thanks to John Viega and David McGrew for their support in developing - this code and to David for testing it on a big-endain system. -*/ - -/* - --------------------------------------------------------------------------- - Portions Copyright (c) 2005 TrueCrypt Developers Association - - Changes: - - - Added multiplication in the finite field GF(2^128) optimized for - cases involving a 64-bit operand. - - - Added multiplication in the finite field GF(2^64). - - - Added MSB-first mode. - - - Added basic test algorithms. - - - Removed GCM. 
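The change list above adds GF(2^128) multiplication in MSB-first mode together with basic test algorithms. As a rough sketch of what that operation computes, the bit-serial reference below multiplies two 16-byte elements using the reduction byte 0x87, following the same bit and byte conventions as GfMul128Basic further down in this file; the helper name gf128_mul_ref is illustrative only and does not appear in the patch.

#include <string.h>

/* Sketch: reference multiply in GF(2^128), MSB-first convention.  Byte 0
   holds the most significant bits and an overflow during the shift is
   reduced by xoring 0x87 into the last byte, as in GfMul128Basic below. */
static void gf128_mul_ref (const unsigned char a[16],
                           const unsigned char b[16],
                           unsigned char r[16])
{
	unsigned char v[16];
	int i, j, carry;

	memcpy (v, a, 16);
	memset (r, 0, 16);

	for (i = 0; i < 128; i++)
	{
		/* bit i of b, numbering from the least significant bit of byte 15 */
		if (b[15 - i / 8] & (1 << (i % 8)))
			for (j = 0; j < 16; j++)
				r[j] ^= v[j];

		/* v = v * x: shift left one bit, reduce with 0x87 on overflow */
		carry = v[0] & 0x80;
		for (j = 0; j < 15; j++)
			v[j] = (unsigned char) ((v[j] << 1) | (v[j + 1] >> 7));
		v[15] = (unsigned char) (v[15] << 1);
		if (carry)
			v[15] ^= 0x87;
	}
}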
- --------------------------------------------------------------------------- -*/ - -#include -#include -#include "GfMul.h" -#include "Tcdefs.h" -#include "Common/Endian.h" - -/* BUFFER_ALIGN32 or BUFFER_ALIGN64 must be defined at this point to */ -/* enable faster operation by taking advantage of memory aligned values */ -/* NOTE: the BUFFER_ALIGN64 option has not been tested extensively */ - -#define BUFFER_ALIGN32 -#define UNROLL_LOOPS /* define to unroll some loops */ -#define IN_LINES /* define to use inline functions */ - /* in place of macros */ - -#define mode(x) GM_##x - -#if defined(__cplusplus) -extern "C" -{ -#endif - -typedef unsigned __int32 mode(32t); -typedef uint64 mode(64t); - -#define BRG_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ -#define BRG_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ - -#if BYTE_ORDER == LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -#endif - -#if BYTE_ORDER == BIG_ENDIAN -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -#endif - -#ifdef _MSC_VER -#pragma intrinsic(memcpy) -#define in_line __inline -#else -#define in_line -#endif - -#if 0 && defined(_MSC_VER) -#define rotl32 _lrotl -#define rotr32 _lrotr -#else -#define rotl32(x,n) (((x) << n) | ((x) >> (32 - n))) -#define rotr32(x,n) (((x) >> n) | ((x) << (32 - n))) -#endif - -#if !defined(bswap_32) -#define bswap_32(x) ((rotr32((x), 24) & 0x00ff00ff) | (rotr32((x), 8) & 0xff00ff00)) -#endif - -#if (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN) -#define SWAP_BYTES -#else -#undef SWAP_BYTES -#endif - -#if defined(SWAP_BYTES) - -#if defined ( IN_LINES ) - -in_line void bsw_32(void * p, unsigned int n) -{ unsigned int i = n; - while(i--) - ((mode(32t)*)p)[i] = bswap_32(((mode(32t)*)p)[i]); -} - -#else - -#define bsw_32(p,n) \ - { int _i = (n); while(_i--) ((mode(32t)*)p)[_i] = bswap_32(((mode(32t)*)p)[_i]); } - -#endif - -#else -#define bsw_32(p,n) -#endif - -/* These values are used to detect long word alignment in order */ -/* to speed up some GCM buffer operations. 
This facility may */ -/* not work on some machines */ - -#define lp08(x) ((unsigned char*)(x)) -#define lp32(x) ((mode(32t)*)(x)) -#define lp64(x) ((mode(64t)*)(x)) - -#define A32_MASK 3 -#define A64_MASK 7 -#define aligned32(x) (!(((mode(32t))(x)) & A32_MASK)) -#define aligned64(x) (!(((mode(32t))(x)) & A64_MASK)) - -#if defined( BUFFER_ALIGN32 ) - -#define ADR_MASK A32_MASK -#define aligned aligned32 -#define lp lp32 -#define lp_inc 4 - -#if defined( IN_LINES ) - -in_line void move_block_aligned( void *p, const void *q) -{ - lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1], - lp32(p)[2] = lp32(q)[2], lp32(p)[3] = lp32(q)[3]; -} - -in_line void move_block_aligned64( void *p, const void *q) -{ - lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1]; -} - -in_line void xor_block_aligned( void *p, const void *q) -{ - lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1], - lp32(p)[2] ^= lp32(q)[2], lp32(p)[3] ^= lp32(q)[3]; -} - -in_line void xor_block_aligned64( void *p, const void *q) -{ - lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1]; -} - -#else - -#define move_block_aligned(p,q) \ - lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1], \ - lp32(p)[2] = lp32(q)[2], lp32(p)[3] = lp32(q)[3] - -#define xor_block_aligned(p,q) \ - lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1], \ - lp32(p)[2] ^= lp32(q)[2], lp32(p)[3] ^= lp32(q)[3] - -#endif - -#elif defined( BUFFER_ALIGN64 ) - -#define ADR_MASK A64_MASK -#define aligned aligned64 -#define lp lp64 -#define lp_inc 8 - -#define move_block_aligned(p,q) \ - lp64(p)[0] = lp64(q)[0], lp64(p)[1] = lp64(q)[1] - -#define xor_block_aligned(p,q) \ - lp64(p)[0] ^= lp64(q)[0], lp64(p)[1] ^= lp64(q)[1] - -#else -#define aligned(x) 0 -#endif - -#define move_block(p,q) memcpy((p), (q), BLOCK_LEN) - -#define xor_block(p,q) \ - lp08(p)[ 0] ^= lp08(q)[ 0], lp08(p)[ 1] ^= lp08(q)[ 1], \ - lp08(p)[ 2] ^= lp08(q)[ 2], lp08(p)[ 3] ^= lp08(q)[ 3], \ - lp08(p)[ 4] ^= lp08(q)[ 4], lp08(p)[ 5] ^= lp08(q)[ 5], \ - lp08(p)[ 6] ^= lp08(q)[ 6], lp08(p)[ 7] ^= lp08(q)[ 7], \ - lp08(p)[ 8] ^= lp08(q)[ 8], lp08(p)[ 9] ^= lp08(q)[ 9], \ - lp08(p)[10] ^= lp08(q)[10], lp08(p)[11] ^= lp08(q)[11], \ - lp08(p)[12] ^= lp08(q)[12], lp08(p)[13] ^= lp08(q)[13], \ - lp08(p)[14] ^= lp08(q)[14], lp08(p)[15] ^= lp08(q)[15] - - -#define gf_dat(q) {\ - q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\ - q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\ - q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\ - q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\ - q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\ - q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\ - q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\ - q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\ - q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\ - q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\ - q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\ - q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\ - q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\ - q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\ - q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\ - q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\ - q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), 
q(0x87),\ - q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\ - q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\ - q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\ - q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\ - q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\ - q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\ - q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\ - q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\ - q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\ - q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\ - q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\ - q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\ - q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\ - q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\ - q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) } - -/* given the value i in 0..255 as the byte overflow when a a field */ -/* element in GHASH is multipled by x^8, this function will return */ -/* the values that are generated in the lo 16-bit word of the field */ -/* value by applying the modular polynomial. The values lo_byte and */ -/* hi_byte are returned via the macro xp_fun(lo_byte, hi_byte) so */ -/* that the values can be assembled into memory as required by a */ -/* suitable definition of this macro operating on the table above */ - -#define xp(i) xp_fun( \ - (i & 0x80 ? 0xe1 : 0) ^ (i & 0x40 ? 0x70 : 0) ^ \ - (i & 0x20 ? 0x38 : 0) ^ (i & 0x10 ? 0x1c : 0) ^ \ - (i & 0x08 ? 0x0e : 0) ^ (i & 0x04 ? 0x07 : 0) ^ \ - (i & 0x02 ? 0x03 : 0) ^ (i & 0x01 ? 0x01 : 0), \ - (i & 0x80 ? 0x00 : 0) ^ (i & 0x40 ? 0x80 : 0) ^ \ - (i & 0x20 ? 0x40 : 0) ^ (i & 0x10 ? 0x20 : 0) ^ \ - (i & 0x08 ? 0x10 : 0) ^ (i & 0x04 ? 0x08 : 0) ^ \ - (i & 0x02 ? 0x84 : 0) ^ (i & 0x01 ? 0xc2 : 0) ) - -#define xp64(i) xp_fun( \ - (i & 0x80 ? 0xd8 : 0) ^ (i & 0x40 ? 0x6c : 0) ^ \ - (i & 0x20 ? 0x36 : 0) ^ (i & 0x10 ? 0x1b : 0) ^ \ - (i & 0x08 ? 0x0d : 0) ^ (i & 0x04 ? 0x06 : 0) ^ \ - (i & 0x02 ? 0x03 : 0) ^ (i & 0x01 ? 0x01 : 0), \ - (i & 0x80 ? 0x00 : 0) ^ (i & 0x40 ? 0x00 : 0) ^ \ - (i & 0x20 ? 0x00 : 0) ^ (i & 0x10 ? 0x00 : 0) ^ \ - (i & 0x08 ? 0x80 : 0) ^ (i & 0x04 ? 0xc0 : 0) ^ \ - (i & 0x02 ? 0x60 : 0) ^ (i & 0x01 ? 0xb0 : 0) ) - -static mode(32t) gf_poly[2] = { 0, 0xe1000000 }; -static mode(32t) gf_poly64[2] = { 0, 0xd8000000 }; - -/* Multiply of a GF128 field element by x. The field element */ -/* is held in an array of bytes in which field bits 8n..8n + 7 */ -/* are held in byte[n], with lower indexed bits placed in the */ -/* more numerically significant bit positions in bytes. */ - -/* This function multiples a field element x, in the polynomial */ -/* field representation. It uses 32-bit word operations to gain */ -/* speed but compensates for machine endianess and hence works */ -/* correctly on both styles of machine */ - -in_line void mul_x(mode(32t) x[4]) -{ mode(32t) t; - - bsw_32(x, 4); - - /* at this point the filed element bits 0..127 are set out */ - /* as follows in 32-bit words (where the most significant */ - /* (ms) numeric bits are to the left) */ - /* */ - /* x[0] x[1] x[2] x[3] */ - /* ms ls ms ls ms ls ms ls */ - /* field: 0 ... 31 32 .. 63 64 .. 95 96 .. 
127 */ - - t = gf_poly[x[3] & 1]; /* bit 127 of the element */ - x[3] = (x[3] >> 1) | (x[2] << 31); /* shift bits up by one */ - x[2] = (x[2] >> 1) | (x[1] << 31); /* position */ - x[1] = (x[1] >> 1) | (x[0] << 31); /* if bit 7 is 1 xor in */ - x[0] = (x[0] >> 1) ^ t; /* the field polynomial */ - bsw_32(x, 4); -} - -in_line void mul_x64(mode(32t) x[2]) -{ mode(32t) t; - - bsw_32(x, 2); - - /* at this point the filed element bits 0..127 are set out */ - /* as follows in 32-bit words (where the most significant */ - /* (ms) numeric bits are to the left) */ - /* */ - /* x[0] x[1] x[2] x[3] */ - /* ms ls ms ls ms ls ms ls */ - /* field: 0 ... 31 32 .. 63 64 .. 95 96 .. 127 */ - - t = gf_poly64[x[1] & 1]; /* bit 127 of the element */ - /* shift bits up by one */ - /* position */ - x[1] = (x[1] >> 1) | (x[0] << 31); /* if bit 7 is 1 xor in */ - x[0] = (x[0] >> 1) ^ t; /* the field polynomial */ - bsw_32(x, 2); -} - -/* Multiply of a GF128 field element by x^8 using 32-bit words */ -/* for speed - machine endianess matters here */ - -#if (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN) - -#define xp_fun(x,y) ((mode(32t))(x)) | (((mode(32t))(y)) << 8) -static const unsigned __int16 gft_le[256] = gf_dat(xp); -static const unsigned __int16 gft_le64[256] = gf_dat(xp64); - -in_line void mul_lex8(mode(32t) x[4]) /* mutiply with long words */ -{ mode(32t) t = (x[3] >> 24); /* in little endian format */ - x[3] = (x[3] << 8) | (x[2] >> 24); - x[2] = (x[2] << 8) | (x[1] >> 24); - x[1] = (x[1] << 8) | (x[0] >> 24); - x[0] = (x[0] << 8) ^ gft_le[t]; -} - -in_line void mul_lex8_64(mode(32t) x[2]) /* mutiply with long words */ -{ mode(32t) t = (x[1] >> 24); /* in little endian format */ - x[1] = (x[1] << 8) | (x[0] >> 24); - x[0] = (x[0] << 8) ^ gft_le64[t]; -} - -#endif - -#if 1 || (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN) - -#undef xp_fun -#define xp_fun(x,y) ((mode(32t))(y)) | (((mode(32t))(x)) << 8) -static const unsigned __int16 gft_be[256] = gf_dat(xp); -static const unsigned __int16 gft_be64[256] = gf_dat(xp64); - -in_line void mul_bex8(mode(32t) x[4]) /* mutiply with long words */ -{ mode(32t) t = (x[3] & 0xff); /* in big endian format */ - x[3] = (x[3] >> 8) | (x[2] << 24); - x[2] = (x[2] >> 8) | (x[1] << 24); - x[1] = (x[1] >> 8) | (x[0] << 24); - x[0] = (x[0] >> 8) ^ (((mode(32t))gft_be[t]) << 16); -} - -in_line void mul_bex8_64(mode(32t) x[2]) /* mutiply with long words */ -{ mode(32t) t = (x[1] & 0xff); /* in big endian format */ - x[1] = (x[1] >> 8) | (x[0] << 24); - x[0] = (x[0] >> 8) ^ (((mode(32t))gft_be64[t]) << 16); -} - -#endif - -/* hence choose the correct version for the machine endianess */ - -#if PLATFORM_BYTE_ORDER == BRG_BIG_ENDIAN -#define mul_x8 mul_bex8 -#define mul_x8_64 mul_bex8_64 -#else -#define mul_x8 mul_lex8 -#define mul_x8_64 mul_lex8_64 -#endif - -/* different versions of the general gf_mul function are provided */ -/* here. 
Sadly none are very fast :-( */ - -void GfMul128 (void *a, const void* b) -{ mode(32t) r[CBLK_LEN >> 2], p[8][CBLK_LEN >> 2]; - int i; - - move_block_aligned(p[0], b); - bsw_32(p[0], 4); - for(i = 0; i < 7; ++i) - { - p[i + 1][3] = (p[i][3] >> 1) | (p[i][2] << 31); - p[i + 1][2] = (p[i][2] >> 1) | (p[i][1] << 31); - p[i + 1][1] = (p[i][1] >> 1) | (p[i][0] << 31); - p[i + 1][0] = (p[i][0] >> 1) ^ gf_poly[p[i][3] & 1]; - } - - memset(r, 0, CBLK_LEN); - for(i = 0; i < 16; ++i) - { - if(i) mul_bex8(r); /* order is always big endian here */ - - if(((unsigned char*)a)[15 - i] & 0x80) - xor_block_aligned(r, p[0]); - if(((unsigned char*)a)[15 - i] & 0x40) - xor_block_aligned(r, p[1]); - if(((unsigned char*)a)[15 - i] & 0x20) - xor_block_aligned(r, p[2]); - if(((unsigned char*)a)[15 - i] & 0x10) - xor_block_aligned(r, p[3]); - if(((unsigned char*)a)[15 - i] & 0x08) - xor_block_aligned(r, p[4]); - if(((unsigned char*)a)[15 - i] & 0x04) - xor_block_aligned(r, p[5]); - if(((unsigned char*)a)[15 - i] & 0x02) - xor_block_aligned(r, p[6]); - if(((unsigned char*)a)[15 - i] & 0x01) - xor_block_aligned(r, p[7]); - } - bsw_32(r, 4); - move_block_aligned(a, r); -} - -#if defined( UNROLL_LOOPS ) - -#define xor_8k(i) \ - xor_block_aligned(r, ctx->gf_t8k[i + i][a[i] & 15]); \ - xor_block_aligned(r, ctx->gf_t8k[i + i + 1][a[i] >> 4]) - - -void GfMul128Tab (unsigned char a[CBLK_LEN], GfCtx8k *ctx) -{ unsigned __int32 r[CBLK_LEN >> 2]; - - move_block_aligned(r, ctx->gf_t8k[0][a[0] & 15]); - xor_block_aligned(r, ctx->gf_t8k[1][a[0] >> 4]); - xor_8k( 1); xor_8k( 2); xor_8k( 3); - xor_8k( 4); xor_8k( 5); xor_8k( 6); xor_8k( 7); - xor_8k( 8); xor_8k( 9); xor_8k(10); xor_8k(11); - xor_8k(12); xor_8k(13); xor_8k(14); xor_8k(15); - move_block_aligned(a, r); -} - -#else - -void GfMul128Tab (unsigned char a[CBLK_LEN], GfCtx8k *ctx) -{ unsigned __int32 r[CBLK_LEN >> 2], *p; - int i; - - p = ctx->gf_t8k[0][a[0] & 15]; - memcpy(r, p, CBLK_LEN); - p = ctx->gf_t8k[1][a[0] >> 4]; - xor_block_aligned(r, p); - for(i = 1; i < CBLK_LEN; ++i) - { - xor_block_aligned(r, ctx->gf_t8k[i + i][a[i] & 15]); - xor_block_aligned(r, ctx->gf_t8k[i + i + 1][a[i] >> 4]); - } - memcpy(a, r, CBLK_LEN); -} - -#endif - -void compile_8k_table(unsigned __int8 *a, GfCtx8k *ctx) -{ int i, j, k; - - memset(ctx->gf_t8k, 0, 32 * 16 * 16); - for(i = 0; i < 2 * CBLK_LEN; ++i) - { - if(i == 0) - { - memcpy(ctx->gf_t8k[1][8], a, CBLK_LEN); - for(j = 4; j > 0; j >>= 1) - { - memcpy(ctx->gf_t8k[1][j], ctx->gf_t8k[1][j + j], CBLK_LEN); - mul_x(ctx->gf_t8k[1][j]); - } - memcpy(ctx->gf_t8k[0][8], ctx->gf_t8k[1][1], CBLK_LEN); - mul_x(ctx->gf_t8k[0][8]); - for(j = 4; j > 0; j >>= 1) - { - memcpy(ctx->gf_t8k[0][j], ctx->gf_t8k[0][j + j], CBLK_LEN); - mul_x(ctx->gf_t8k[0][j]); - } - } - else if(i > 1) - for(j = 8; j > 0; j >>= 1) - { - memcpy(ctx->gf_t8k[i][j], ctx->gf_t8k[i - 2][j], CBLK_LEN); - mul_x8(ctx->gf_t8k[i][j]); - } - - for(j = 2; j < 16; j += j) - { - mode(32t) *pj = ctx->gf_t8k[i][j]; - mode(32t) *pk = ctx->gf_t8k[i][1]; - mode(32t) *pl = ctx->gf_t8k[i][j + 1]; - - for(k = 1; k < j; ++k) - { - *pl++ = pj[0] ^ *pk++; - *pl++ = pj[1] ^ *pk++; - *pl++ = pj[2] ^ *pk++; - *pl++ = pj[3] ^ *pk++; - } - } - } -} - - -void compile_4k_table64(unsigned __int8 *a, GfCtx4k64 *ctx) -{ int i, j, k; - - memset(ctx->gf_t4k, 0, sizeof(ctx->gf_t4k)); - for(i = 0; i < 2 * CBLK_LEN8; ++i) - { - if(i == 0) - { - memcpy(ctx->gf_t4k[1][8], a, CBLK_LEN8); - for(j = 4; j > 0; j >>= 1) - { - memcpy(ctx->gf_t4k[1][j], ctx->gf_t4k[1][j + j], CBLK_LEN8); - mul_x64(ctx->gf_t4k[1][j]); - 
} - memcpy(ctx->gf_t4k[0][8], ctx->gf_t4k[1][1], CBLK_LEN8); - mul_x64(ctx->gf_t4k[0][8]); - for(j = 4; j > 0; j >>= 1) - { - memcpy(ctx->gf_t4k[0][j], ctx->gf_t4k[0][j + j], CBLK_LEN8); - mul_x64(ctx->gf_t4k[0][j]); - } - } - else if(i > 1) - for(j = 8; j > 0; j >>= 1) - { - memcpy(ctx->gf_t4k[i][j], ctx->gf_t4k[i - 2][j], CBLK_LEN8); - mul_x8_64(ctx->gf_t4k[i][j]); - } - - for(j = 2; j < 16; j += j) - { - mode(32t) *pj = ctx->gf_t4k[i][j]; - mode(32t) *pk = ctx->gf_t4k[i][1]; - mode(32t) *pl = ctx->gf_t4k[i][j + 1]; - - for(k = 1; k < j; ++k) - { - *pl++ = pj[0] ^ *pk++; - *pl++ = pj[1] ^ *pk++; - *pl++ = pj[2] ^ *pk++; - *pl++ = pj[3] ^ *pk++; - } - } - } -} - -static int IsBitSet128 (unsigned int bit, unsigned __int8 *a) -{ - return a[(127 - bit) / 8] & (0x80 >> ((127 - bit) % 8)); -} - -static int IsBitSet64 (unsigned int bit, unsigned __int8 *a) -{ - return a[(63 - bit) / 8] & (0x80 >> ((63 - bit) % 8)); -} - -static void SetBit128 (unsigned int bit, unsigned __int8 *a) -{ - a[(127 - bit) / 8] |= 0x80 >> ((127 - bit) % 8); -} - -static void SetBit64 (unsigned int bit, unsigned __int8 *a) -{ - a[(63 - bit) / 8] |= 0x80 >> ((63 - bit) % 8); -} - -void MirrorBits128 (unsigned __int8 *a) -{ - unsigned __int8 t[128 / 8]; - int i; - memset (t,0,16); - for (i = 0; i < 128; i++) - { - if (IsBitSet128(i, a)) - SetBit128 (127 - i, t); - } - memcpy (a, t, sizeof (t)); - burn (t,sizeof (t)); -} - -void MirrorBits64 (unsigned __int8 *a) -{ - unsigned __int8 t[64 / 8]; - int i; - memset (t,0,8); - for (i = 0; i < 64; i++) - { - if (IsBitSet64(i, a)) - SetBit64 (63 - i, t); - } - memcpy (a, t, sizeof (t)); - burn (t,sizeof (t)); -} - -/* Allocate and initialize speed optimization table - for multiplication by 64-bit operand in MSB-first mode */ -int Gf128Tab64Init (unsigned __int8 *a, GfCtx *ctx) -{ - GfCtx8k *ctx8k; - unsigned __int8 am[16]; - int i, j; - - ctx8k = (GfCtx8k *) TCalloc (sizeof (GfCtx8k)); - if (!ctx8k) - return FALSE; - - memcpy (am, a, 16); - MirrorBits128 (am); - compile_8k_table (am, ctx8k); - - /* Convert 8k LSB-first table to 4k MSB-first */ - for (i = 16; i < 32; i++) - { - for (j = 0; j < 16; j++) - { - int jm = 0; - jm |= (j & 0x1) << 3; - jm |= (j & 0x2) << 1; - jm |= (j & 0x4) >> 1; - jm |= (j & 0x8) >> 3; - - memcpy (&ctx->gf_t128[i-16][jm], (unsigned char *)&ctx8k->gf_t8k[31-i][j], 16); - MirrorBits128 ((unsigned char *)&ctx->gf_t128[i-16][jm]); - } - } - - burn (ctx8k ,sizeof (*ctx8k)); - burn (am, sizeof (am)); - TCfree (ctx8k); - return TRUE; -} - - -#define xor_8kt64(i) \ - xor_block_aligned(r, ctx->gf_t128[i + i][a[i] & 15]); \ - xor_block_aligned(r, ctx->gf_t128[i + i + 1][a[i] >> 4]) - -/* Multiply a 128-bit number by a 64-bit number in the finite field GF(2^128) */ -void Gf128MulBy64Tab (unsigned __int8 a[8], unsigned __int8 p[16], GfCtx *ctx) -{ - unsigned __int32 r[CBLK_LEN >> 2]; - - move_block_aligned(r, ctx->gf_t128[7*2][a[7] & 15]); - xor_block_aligned(r, ctx->gf_t128[7*2+1][a[7] >> 4]); - - if (*(unsigned __int16 *)a) - { - xor_8kt64(0); - xor_8kt64(1); - } - if (a[2]) - { - xor_8kt64(2); - } - xor_8kt64(3); - xor_8kt64(4); - xor_8kt64(5); - xor_8kt64(6); - - move_block_aligned(p, r); -} - - - -/* Basic algorithms for testing of optimized algorithms */ - -static void xor128 (uint64 *a, uint64 *b) -{ - *a++ ^= *b++; - *a ^= *b; -} - -static void shl128 (unsigned __int8 *a) -{ - int i, x = 0, xx; - for (i = 15; i >= 0; i--) - { - xx = (a[i] & 0x80) >> 7; - a[i] = (char) ((a[i] << 1) | x); - x = xx; - } -} - -static void GfMul128Basic (unsigned __int8 *a, 
unsigned __int8 *b, unsigned __int8 *p) -{ - int i; - unsigned __int8 la[16]; - memcpy (la, a, 16); - memset (p, 0, 16); - - for (i = 0; i < 128; i++) - { - if (IsBitSet128 (i, b)) - xor128 ((uint64 *)p, (uint64 *)la); - - if (la[0] & 0x80) - { - shl128 (la); - la[15] ^= 0x87; - } - else - { - shl128 (la); - } - } -} - - -BOOL GfMulSelfTest () -{ - BOOL result = TRUE; - unsigned __int8 a[16]; - unsigned __int8 b[16]; - unsigned __int8 p1[16]; - unsigned __int8 p2[16]; - GfCtx *gfCtx = (GfCtx *) TCalloc (sizeof (GfCtx)); - int i, j; - - if (!gfCtx) - return FALSE; - - - /* GF(2^128) */ - for (i = 0; i < 0x100; i++) - { - for (j = 0; j < 16; j++) - { - a[j] = (unsigned __int8) i; - b[j] = j < 8 ? 0 : a[j] ^ 0xff; - } - - GfMul128Basic (a, b, p1); - - Gf128Tab64Init (a, gfCtx); - Gf128MulBy64Tab (b + 8, p2, gfCtx); - - if (memcmp (p1, p2, 16) != 0) - result = FALSE; - } - - TCfree (gfCtx); - return result; -} - -#if defined(__cplusplus) -} -#endif +/* + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software is allowed (with or without + changes) provided that: + + 1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + 2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + + 3. the name of the copyright holder is not used to endorse products + built using this software without specific written permission. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue Date: 31/01/2004 + + My thanks to John Viega and David McGrew for their support in developing + this code and to David for testing it on a big-endain system. +*/ + +/* + --------------------------------------------------------------------------- + Portions Copyright (c) 2005 TrueCrypt Developers Association + + Changes: + + - Added multiplication in the finite field GF(2^128) optimized for + cases involving a 64-bit operand. + + - Added multiplication in the finite field GF(2^64). + + - Added MSB-first mode. + + - Added basic test algorithms. + + - Removed GCM. 
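GfMulSelfTest above exercises the fast 64-bit-operand path by comparing it against GfMul128Basic. A minimal usage sketch of that path follows; it assumes only the declarations from GfMul.h and Tcdefs.h that this file already includes, and the key and operand bytes are illustrative values, not test vectors from the patch.

#include "Tcdefs.h"
#include "GfMul.h"

/* Sketch: multiply one fixed 128-bit element k by a 64-bit value v.
   Gf128Tab64Init compiles the lookup table once per k; every multiply
   afterwards costs only table lookups and XORs in Gf128MulBy64Tab. */
static int example_mul_by_64 (unsigned __int8 product[16])
{
	unsigned __int8 k[16] = { 0x01, 0x02 };   /* illustrative key bytes */
	unsigned __int8 v[8]  = { 0xff, 0xee };   /* illustrative 64-bit operand */
	GfCtx *ctx = (GfCtx *) TCalloc (sizeof (GfCtx));

	if (!ctx)
		return FALSE;

	if (!Gf128Tab64Init (k, ctx))             /* build MSB-first tables for k */
	{
		TCfree (ctx);
		return FALSE;
	}

	Gf128MulBy64Tab (v, product, ctx);        /* product = k * v in GF(2^128) */

	burn (ctx, sizeof (*ctx));                /* wipe key-dependent tables */
	TCfree (ctx);
	return TRUE;
}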
+ --------------------------------------------------------------------------- +*/ + +#include +#include +#include "GfMul.h" +#include "Tcdefs.h" +#include "Common/Endian.h" + +/* BUFFER_ALIGN32 or BUFFER_ALIGN64 must be defined at this point to */ +/* enable faster operation by taking advantage of memory aligned values */ +/* NOTE: the BUFFER_ALIGN64 option has not been tested extensively */ + +#define BUFFER_ALIGN32 +#define UNROLL_LOOPS /* define to unroll some loops */ +#define IN_LINES /* define to use inline functions */ + /* in place of macros */ + +#define mode(x) GM_##x + +#if defined(__cplusplus) +extern "C" +{ +#endif + +typedef unsigned __int32 mode(32t); +typedef uint64 mode(64t); + +#define BRG_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ +#define BRG_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ + +#if BYTE_ORDER == LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN +#endif + +#if BYTE_ORDER == BIG_ENDIAN +# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN +#endif + +#ifdef _MSC_VER +#pragma intrinsic(memcpy) +#define in_line __inline +#else +#define in_line +#endif + +#if 0 && defined(_MSC_VER) +#define rotl32 _lrotl +#define rotr32 _lrotr +#else +#define rotl32(x,n) (((x) << n) | ((x) >> (32 - n))) +#define rotr32(x,n) (((x) >> n) | ((x) << (32 - n))) +#endif + +#if !defined(bswap_32) +#define bswap_32(x) ((rotr32((x), 24) & 0x00ff00ff) | (rotr32((x), 8) & 0xff00ff00)) +#endif + +#if (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN) +#define SWAP_BYTES +#else +#undef SWAP_BYTES +#endif + +#if defined(SWAP_BYTES) + +#if defined ( IN_LINES ) + +in_line void bsw_32(void * p, unsigned int n) +{ unsigned int i = n; + while(i--) + ((mode(32t)*)p)[i] = bswap_32(((mode(32t)*)p)[i]); +} + +#else + +#define bsw_32(p,n) \ + { int _i = (n); while(_i--) ((mode(32t)*)p)[_i] = bswap_32(((mode(32t)*)p)[_i]); } + +#endif + +#else +#define bsw_32(p,n) +#endif + +/* These values are used to detect long word alignment in order */ +/* to speed up some GCM buffer operations. 
This facility may */ +/* not work on some machines */ + +#define lp08(x) ((unsigned char*)(x)) +#define lp32(x) ((mode(32t)*)(x)) +#define lp64(x) ((mode(64t)*)(x)) + +#define A32_MASK 3 +#define A64_MASK 7 +#define aligned32(x) (!(((mode(32t))(x)) & A32_MASK)) +#define aligned64(x) (!(((mode(32t))(x)) & A64_MASK)) + +#if defined( BUFFER_ALIGN32 ) + +#define ADR_MASK A32_MASK +#define aligned aligned32 +#define lp lp32 +#define lp_inc 4 + +#if defined( IN_LINES ) + +in_line void move_block_aligned( void *p, const void *q) +{ + lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1], + lp32(p)[2] = lp32(q)[2], lp32(p)[3] = lp32(q)[3]; +} + +in_line void move_block_aligned64( void *p, const void *q) +{ + lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1]; +} + +in_line void xor_block_aligned( void *p, const void *q) +{ + lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1], + lp32(p)[2] ^= lp32(q)[2], lp32(p)[3] ^= lp32(q)[3]; +} + +in_line void xor_block_aligned64( void *p, const void *q) +{ + lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1]; +} + +#else + +#define move_block_aligned(p,q) \ + lp32(p)[0] = lp32(q)[0], lp32(p)[1] = lp32(q)[1], \ + lp32(p)[2] = lp32(q)[2], lp32(p)[3] = lp32(q)[3] + +#define xor_block_aligned(p,q) \ + lp32(p)[0] ^= lp32(q)[0], lp32(p)[1] ^= lp32(q)[1], \ + lp32(p)[2] ^= lp32(q)[2], lp32(p)[3] ^= lp32(q)[3] + +#endif + +#elif defined( BUFFER_ALIGN64 ) + +#define ADR_MASK A64_MASK +#define aligned aligned64 +#define lp lp64 +#define lp_inc 8 + +#define move_block_aligned(p,q) \ + lp64(p)[0] = lp64(q)[0], lp64(p)[1] = lp64(q)[1] + +#define xor_block_aligned(p,q) \ + lp64(p)[0] ^= lp64(q)[0], lp64(p)[1] ^= lp64(q)[1] + +#else +#define aligned(x) 0 +#endif + +#define move_block(p,q) memcpy((p), (q), BLOCK_LEN) + +#define xor_block(p,q) \ + lp08(p)[ 0] ^= lp08(q)[ 0], lp08(p)[ 1] ^= lp08(q)[ 1], \ + lp08(p)[ 2] ^= lp08(q)[ 2], lp08(p)[ 3] ^= lp08(q)[ 3], \ + lp08(p)[ 4] ^= lp08(q)[ 4], lp08(p)[ 5] ^= lp08(q)[ 5], \ + lp08(p)[ 6] ^= lp08(q)[ 6], lp08(p)[ 7] ^= lp08(q)[ 7], \ + lp08(p)[ 8] ^= lp08(q)[ 8], lp08(p)[ 9] ^= lp08(q)[ 9], \ + lp08(p)[10] ^= lp08(q)[10], lp08(p)[11] ^= lp08(q)[11], \ + lp08(p)[12] ^= lp08(q)[12], lp08(p)[13] ^= lp08(q)[13], \ + lp08(p)[14] ^= lp08(q)[14], lp08(p)[15] ^= lp08(q)[15] + + +#define gf_dat(q) {\ + q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\ + q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\ + q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\ + q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\ + q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\ + q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\ + q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\ + q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\ + q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\ + q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\ + q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\ + q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\ + q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\ + q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\ + q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\ + q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\ + q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), 
q(0x87),\ + q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\ + q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\ + q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\ + q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\ + q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\ + q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\ + q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\ + q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\ + q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\ + q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\ + q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\ + q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\ + q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\ + q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\ + q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) } + +/* given the value i in 0..255 as the byte overflow when a a field */ +/* element in GHASH is multipled by x^8, this function will return */ +/* the values that are generated in the lo 16-bit word of the field */ +/* value by applying the modular polynomial. The values lo_byte and */ +/* hi_byte are returned via the macro xp_fun(lo_byte, hi_byte) so */ +/* that the values can be assembled into memory as required by a */ +/* suitable definition of this macro operating on the table above */ + +#define xp(i) xp_fun( \ + (i & 0x80 ? 0xe1 : 0) ^ (i & 0x40 ? 0x70 : 0) ^ \ + (i & 0x20 ? 0x38 : 0) ^ (i & 0x10 ? 0x1c : 0) ^ \ + (i & 0x08 ? 0x0e : 0) ^ (i & 0x04 ? 0x07 : 0) ^ \ + (i & 0x02 ? 0x03 : 0) ^ (i & 0x01 ? 0x01 : 0), \ + (i & 0x80 ? 0x00 : 0) ^ (i & 0x40 ? 0x80 : 0) ^ \ + (i & 0x20 ? 0x40 : 0) ^ (i & 0x10 ? 0x20 : 0) ^ \ + (i & 0x08 ? 0x10 : 0) ^ (i & 0x04 ? 0x08 : 0) ^ \ + (i & 0x02 ? 0x84 : 0) ^ (i & 0x01 ? 0xc2 : 0) ) + +#define xp64(i) xp_fun( \ + (i & 0x80 ? 0xd8 : 0) ^ (i & 0x40 ? 0x6c : 0) ^ \ + (i & 0x20 ? 0x36 : 0) ^ (i & 0x10 ? 0x1b : 0) ^ \ + (i & 0x08 ? 0x0d : 0) ^ (i & 0x04 ? 0x06 : 0) ^ \ + (i & 0x02 ? 0x03 : 0) ^ (i & 0x01 ? 0x01 : 0), \ + (i & 0x80 ? 0x00 : 0) ^ (i & 0x40 ? 0x00 : 0) ^ \ + (i & 0x20 ? 0x00 : 0) ^ (i & 0x10 ? 0x00 : 0) ^ \ + (i & 0x08 ? 0x80 : 0) ^ (i & 0x04 ? 0xc0 : 0) ^ \ + (i & 0x02 ? 0x60 : 0) ^ (i & 0x01 ? 0xb0 : 0) ) + +static mode(32t) gf_poly[2] = { 0, 0xe1000000 }; +static mode(32t) gf_poly64[2] = { 0, 0xd8000000 }; + +/* Multiply of a GF128 field element by x. The field element */ +/* is held in an array of bytes in which field bits 8n..8n + 7 */ +/* are held in byte[n], with lower indexed bits placed in the */ +/* more numerically significant bit positions in bytes. */ + +/* This function multiples a field element x, in the polynomial */ +/* field representation. It uses 32-bit word operations to gain */ +/* speed but compensates for machine endianess and hence works */ +/* correctly on both styles of machine */ + +in_line void mul_x(mode(32t) x[4]) +{ mode(32t) t; + + bsw_32(x, 4); + + /* at this point the filed element bits 0..127 are set out */ + /* as follows in 32-bit words (where the most significant */ + /* (ms) numeric bits are to the left) */ + /* */ + /* x[0] x[1] x[2] x[3] */ + /* ms ls ms ls ms ls ms ls */ + /* field: 0 ... 31 32 .. 63 64 .. 95 96 .. 
127 */ + + t = gf_poly[x[3] & 1]; /* bit 127 of the element */ + x[3] = (x[3] >> 1) | (x[2] << 31); /* shift bits up by one */ + x[2] = (x[2] >> 1) | (x[1] << 31); /* position */ + x[1] = (x[1] >> 1) | (x[0] << 31); /* if bit 7 is 1 xor in */ + x[0] = (x[0] >> 1) ^ t; /* the field polynomial */ + bsw_32(x, 4); +} + +in_line void mul_x64(mode(32t) x[2]) +{ mode(32t) t; + + bsw_32(x, 2); + + /* at this point the filed element bits 0..127 are set out */ + /* as follows in 32-bit words (where the most significant */ + /* (ms) numeric bits are to the left) */ + /* */ + /* x[0] x[1] x[2] x[3] */ + /* ms ls ms ls ms ls ms ls */ + /* field: 0 ... 31 32 .. 63 64 .. 95 96 .. 127 */ + + t = gf_poly64[x[1] & 1]; /* bit 127 of the element */ + /* shift bits up by one */ + /* position */ + x[1] = (x[1] >> 1) | (x[0] << 31); /* if bit 7 is 1 xor in */ + x[0] = (x[0] >> 1) ^ t; /* the field polynomial */ + bsw_32(x, 2); +} + +/* Multiply of a GF128 field element by x^8 using 32-bit words */ +/* for speed - machine endianess matters here */ + +#if (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN) + +#define xp_fun(x,y) ((mode(32t))(x)) | (((mode(32t))(y)) << 8) +static const unsigned __int16 gft_le[256] = gf_dat(xp); +static const unsigned __int16 gft_le64[256] = gf_dat(xp64); + +in_line void mul_lex8(mode(32t) x[4]) /* mutiply with long words */ +{ mode(32t) t = (x[3] >> 24); /* in little endian format */ + x[3] = (x[3] << 8) | (x[2] >> 24); + x[2] = (x[2] << 8) | (x[1] >> 24); + x[1] = (x[1] << 8) | (x[0] >> 24); + x[0] = (x[0] << 8) ^ gft_le[t]; +} + +in_line void mul_lex8_64(mode(32t) x[2]) /* mutiply with long words */ +{ mode(32t) t = (x[1] >> 24); /* in little endian format */ + x[1] = (x[1] << 8) | (x[0] >> 24); + x[0] = (x[0] << 8) ^ gft_le64[t]; +} + +#endif + +#if 1 || (PLATFORM_BYTE_ORDER == BRG_LITTLE_ENDIAN) + +#undef xp_fun +#define xp_fun(x,y) ((mode(32t))(y)) | (((mode(32t))(x)) << 8) +static const unsigned __int16 gft_be[256] = gf_dat(xp); +static const unsigned __int16 gft_be64[256] = gf_dat(xp64); + +in_line void mul_bex8(mode(32t) x[4]) /* mutiply with long words */ +{ mode(32t) t = (x[3] & 0xff); /* in big endian format */ + x[3] = (x[3] >> 8) | (x[2] << 24); + x[2] = (x[2] >> 8) | (x[1] << 24); + x[1] = (x[1] >> 8) | (x[0] << 24); + x[0] = (x[0] >> 8) ^ (((mode(32t))gft_be[t]) << 16); +} + +in_line void mul_bex8_64(mode(32t) x[2]) /* mutiply with long words */ +{ mode(32t) t = (x[1] & 0xff); /* in big endian format */ + x[1] = (x[1] >> 8) | (x[0] << 24); + x[0] = (x[0] >> 8) ^ (((mode(32t))gft_be64[t]) << 16); +} + +#endif + +/* hence choose the correct version for the machine endianess */ + +#if PLATFORM_BYTE_ORDER == BRG_BIG_ENDIAN +#define mul_x8 mul_bex8 +#define mul_x8_64 mul_bex8_64 +#else +#define mul_x8 mul_lex8 +#define mul_x8_64 mul_lex8_64 +#endif + +/* different versions of the general gf_mul function are provided */ +/* here. 
Sadly none are very fast :-( */ + +void GfMul128 (void *a, const void* b) +{ mode(32t) r[CBLK_LEN >> 2], p[8][CBLK_LEN >> 2]; + int i; + + move_block_aligned(p[0], b); + bsw_32(p[0], 4); + for(i = 0; i < 7; ++i) + { + p[i + 1][3] = (p[i][3] >> 1) | (p[i][2] << 31); + p[i + 1][2] = (p[i][2] >> 1) | (p[i][1] << 31); + p[i + 1][1] = (p[i][1] >> 1) | (p[i][0] << 31); + p[i + 1][0] = (p[i][0] >> 1) ^ gf_poly[p[i][3] & 1]; + } + + memset(r, 0, CBLK_LEN); + for(i = 0; i < 16; ++i) + { + if(i) mul_bex8(r); /* order is always big endian here */ + + if(((unsigned char*)a)[15 - i] & 0x80) + xor_block_aligned(r, p[0]); + if(((unsigned char*)a)[15 - i] & 0x40) + xor_block_aligned(r, p[1]); + if(((unsigned char*)a)[15 - i] & 0x20) + xor_block_aligned(r, p[2]); + if(((unsigned char*)a)[15 - i] & 0x10) + xor_block_aligned(r, p[3]); + if(((unsigned char*)a)[15 - i] & 0x08) + xor_block_aligned(r, p[4]); + if(((unsigned char*)a)[15 - i] & 0x04) + xor_block_aligned(r, p[5]); + if(((unsigned char*)a)[15 - i] & 0x02) + xor_block_aligned(r, p[6]); + if(((unsigned char*)a)[15 - i] & 0x01) + xor_block_aligned(r, p[7]); + } + bsw_32(r, 4); + move_block_aligned(a, r); +} + +#if defined( UNROLL_LOOPS ) + +#define xor_8k(i) \ + xor_block_aligned(r, ctx->gf_t8k[i + i][a[i] & 15]); \ + xor_block_aligned(r, ctx->gf_t8k[i + i + 1][a[i] >> 4]) + + +void GfMul128Tab (unsigned char a[CBLK_LEN], GfCtx8k *ctx) +{ unsigned __int32 r[CBLK_LEN >> 2]; + + move_block_aligned(r, ctx->gf_t8k[0][a[0] & 15]); + xor_block_aligned(r, ctx->gf_t8k[1][a[0] >> 4]); + xor_8k( 1); xor_8k( 2); xor_8k( 3); + xor_8k( 4); xor_8k( 5); xor_8k( 6); xor_8k( 7); + xor_8k( 8); xor_8k( 9); xor_8k(10); xor_8k(11); + xor_8k(12); xor_8k(13); xor_8k(14); xor_8k(15); + move_block_aligned(a, r); +} + +#else + +void GfMul128Tab (unsigned char a[CBLK_LEN], GfCtx8k *ctx) +{ unsigned __int32 r[CBLK_LEN >> 2], *p; + int i; + + p = ctx->gf_t8k[0][a[0] & 15]; + memcpy(r, p, CBLK_LEN); + p = ctx->gf_t8k[1][a[0] >> 4]; + xor_block_aligned(r, p); + for(i = 1; i < CBLK_LEN; ++i) + { + xor_block_aligned(r, ctx->gf_t8k[i + i][a[i] & 15]); + xor_block_aligned(r, ctx->gf_t8k[i + i + 1][a[i] >> 4]); + } + memcpy(a, r, CBLK_LEN); +} + +#endif + +void compile_8k_table(unsigned __int8 *a, GfCtx8k *ctx) +{ int i, j, k; + + memset(ctx->gf_t8k, 0, 32 * 16 * 16); + for(i = 0; i < 2 * CBLK_LEN; ++i) + { + if(i == 0) + { + memcpy(ctx->gf_t8k[1][8], a, CBLK_LEN); + for(j = 4; j > 0; j >>= 1) + { + memcpy(ctx->gf_t8k[1][j], ctx->gf_t8k[1][j + j], CBLK_LEN); + mul_x(ctx->gf_t8k[1][j]); + } + memcpy(ctx->gf_t8k[0][8], ctx->gf_t8k[1][1], CBLK_LEN); + mul_x(ctx->gf_t8k[0][8]); + for(j = 4; j > 0; j >>= 1) + { + memcpy(ctx->gf_t8k[0][j], ctx->gf_t8k[0][j + j], CBLK_LEN); + mul_x(ctx->gf_t8k[0][j]); + } + } + else if(i > 1) + for(j = 8; j > 0; j >>= 1) + { + memcpy(ctx->gf_t8k[i][j], ctx->gf_t8k[i - 2][j], CBLK_LEN); + mul_x8(ctx->gf_t8k[i][j]); + } + + for(j = 2; j < 16; j += j) + { + mode(32t) *pj = ctx->gf_t8k[i][j]; + mode(32t) *pk = ctx->gf_t8k[i][1]; + mode(32t) *pl = ctx->gf_t8k[i][j + 1]; + + for(k = 1; k < j; ++k) + { + *pl++ = pj[0] ^ *pk++; + *pl++ = pj[1] ^ *pk++; + *pl++ = pj[2] ^ *pk++; + *pl++ = pj[3] ^ *pk++; + } + } + } +} + + +void compile_4k_table64(unsigned __int8 *a, GfCtx4k64 *ctx) +{ int i, j, k; + + memset(ctx->gf_t4k, 0, sizeof(ctx->gf_t4k)); + for(i = 0; i < 2 * CBLK_LEN8; ++i) + { + if(i == 0) + { + memcpy(ctx->gf_t4k[1][8], a, CBLK_LEN8); + for(j = 4; j > 0; j >>= 1) + { + memcpy(ctx->gf_t4k[1][j], ctx->gf_t4k[1][j + j], CBLK_LEN8); + mul_x64(ctx->gf_t4k[1][j]); + 
} + memcpy(ctx->gf_t4k[0][8], ctx->gf_t4k[1][1], CBLK_LEN8); + mul_x64(ctx->gf_t4k[0][8]); + for(j = 4; j > 0; j >>= 1) + { + memcpy(ctx->gf_t4k[0][j], ctx->gf_t4k[0][j + j], CBLK_LEN8); + mul_x64(ctx->gf_t4k[0][j]); + } + } + else if(i > 1) + for(j = 8; j > 0; j >>= 1) + { + memcpy(ctx->gf_t4k[i][j], ctx->gf_t4k[i - 2][j], CBLK_LEN8); + mul_x8_64(ctx->gf_t4k[i][j]); + } + + for(j = 2; j < 16; j += j) + { + mode(32t) *pj = ctx->gf_t4k[i][j]; + mode(32t) *pk = ctx->gf_t4k[i][1]; + mode(32t) *pl = ctx->gf_t4k[i][j + 1]; + + for(k = 1; k < j; ++k) + { + *pl++ = pj[0] ^ *pk++; + *pl++ = pj[1] ^ *pk++; + *pl++ = pj[2] ^ *pk++; + *pl++ = pj[3] ^ *pk++; + } + } + } +} + +static int IsBitSet128 (unsigned int bit, unsigned __int8 *a) +{ + return a[(127 - bit) / 8] & (0x80 >> ((127 - bit) % 8)); +} + +static int IsBitSet64 (unsigned int bit, unsigned __int8 *a) +{ + return a[(63 - bit) / 8] & (0x80 >> ((63 - bit) % 8)); +} + +static void SetBit128 (unsigned int bit, unsigned __int8 *a) +{ + a[(127 - bit) / 8] |= 0x80 >> ((127 - bit) % 8); +} + +static void SetBit64 (unsigned int bit, unsigned __int8 *a) +{ + a[(63 - bit) / 8] |= 0x80 >> ((63 - bit) % 8); +} + +void MirrorBits128 (unsigned __int8 *a) +{ + unsigned __int8 t[128 / 8]; + int i; + memset (t,0,16); + for (i = 0; i < 128; i++) + { + if (IsBitSet128(i, a)) + SetBit128 (127 - i, t); + } + memcpy (a, t, sizeof (t)); + burn (t,sizeof (t)); +} + +void MirrorBits64 (unsigned __int8 *a) +{ + unsigned __int8 t[64 / 8]; + int i; + memset (t,0,8); + for (i = 0; i < 64; i++) + { + if (IsBitSet64(i, a)) + SetBit64 (63 - i, t); + } + memcpy (a, t, sizeof (t)); + burn (t,sizeof (t)); +} + +/* Allocate and initialize speed optimization table + for multiplication by 64-bit operand in MSB-first mode */ +int Gf128Tab64Init (unsigned __int8 *a, GfCtx *ctx) +{ + GfCtx8k *ctx8k; + unsigned __int8 am[16]; + int i, j; + + ctx8k = (GfCtx8k *) TCalloc (sizeof (GfCtx8k)); + if (!ctx8k) + return FALSE; + + memcpy (am, a, 16); + MirrorBits128 (am); + compile_8k_table (am, ctx8k); + + /* Convert 8k LSB-first table to 4k MSB-first */ + for (i = 16; i < 32; i++) + { + for (j = 0; j < 16; j++) + { + int jm = 0; + jm |= (j & 0x1) << 3; + jm |= (j & 0x2) << 1; + jm |= (j & 0x4) >> 1; + jm |= (j & 0x8) >> 3; + + memcpy (&ctx->gf_t128[i-16][jm], (unsigned char *)&ctx8k->gf_t8k[31-i][j], 16); + MirrorBits128 ((unsigned char *)&ctx->gf_t128[i-16][jm]); + } + } + + burn (ctx8k ,sizeof (*ctx8k)); + burn (am, sizeof (am)); + TCfree (ctx8k); + return TRUE; +} + + +#define xor_8kt64(i) \ + xor_block_aligned(r, ctx->gf_t128[i + i][a[i] & 15]); \ + xor_block_aligned(r, ctx->gf_t128[i + i + 1][a[i] >> 4]) + +/* Multiply a 128-bit number by a 64-bit number in the finite field GF(2^128) */ +void Gf128MulBy64Tab (unsigned __int8 a[8], unsigned __int8 p[16], GfCtx *ctx) +{ + unsigned __int32 r[CBLK_LEN >> 2]; + + move_block_aligned(r, ctx->gf_t128[7*2][a[7] & 15]); + xor_block_aligned(r, ctx->gf_t128[7*2+1][a[7] >> 4]); + + if (*(unsigned __int16 *)a) + { + xor_8kt64(0); + xor_8kt64(1); + } + if (a[2]) + { + xor_8kt64(2); + } + xor_8kt64(3); + xor_8kt64(4); + xor_8kt64(5); + xor_8kt64(6); + + move_block_aligned(p, r); +} + + + +/* Basic algorithms for testing of optimized algorithms */ + +static void xor128 (uint64 *a, uint64 *b) +{ + *a++ ^= *b++; + *a ^= *b; +} + +static void shl128 (unsigned __int8 *a) +{ + int i, x = 0, xx; + for (i = 15; i >= 0; i--) + { + xx = (a[i] & 0x80) >> 7; + a[i] = (char) ((a[i] << 1) | x); + x = xx; + } +} + +static void GfMul128Basic (unsigned __int8 *a, 
unsigned __int8 *b, unsigned __int8 *p) +{ + int i; + unsigned __int8 la[16]; + memcpy (la, a, 16); + memset (p, 0, 16); + + for (i = 0; i < 128; i++) + { + if (IsBitSet128 (i, b)) + xor128 ((uint64 *)p, (uint64 *)la); + + if (la[0] & 0x80) + { + shl128 (la); + la[15] ^= 0x87; + } + else + { + shl128 (la); + } + } +} + + +BOOL GfMulSelfTest () +{ + BOOL result = TRUE; + unsigned __int8 a[16]; + unsigned __int8 b[16]; + unsigned __int8 p1[16]; + unsigned __int8 p2[16]; + GfCtx *gfCtx = (GfCtx *) TCalloc (sizeof (GfCtx)); + int i, j; + + if (!gfCtx) + return FALSE; + + + /* GF(2^128) */ + for (i = 0; i < 0x100; i++) + { + for (j = 0; j < 16; j++) + { + a[j] = (unsigned __int8) i; + b[j] = j < 8 ? 0 : a[j] ^ 0xff; + } + + GfMul128Basic (a, b, p1); + + Gf128Tab64Init (a, gfCtx); + Gf128MulBy64Tab (b + 8, p2, gfCtx); + + if (memcmp (p1, p2, 16) != 0) + result = FALSE; + } + + TCfree (gfCtx); + return result; +} + +#if defined(__cplusplus) +} +#endif -- cgit v1.2.3
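The word-oriented mul_x in the file above hides the underlying byte operation behind bsw_32 and gf_poly. The sketch below gives an equivalent byte-wise formulation of the same step; the helper name mul_x_bytes is hypothetical and not part of the patch.

/* Sketch: byte-wise equivalent of mul_x.  The field element occupies 16
   bytes with byte 0 carrying field bits 0..7; multiplying by x moves every
   bit one index higher, i.e. a right shift of the bytes taken as a
   big-endian value, and if field bit 127 (the low bit of byte 15) falls
   off, the reduction byte 0xE1 behind gf_poly is xored into byte 0. */
static void mul_x_bytes (unsigned char x[16])
{
	int i;
	int carry = x[15] & 1;            /* field bit 127 */

	for (i = 15; i > 0; --i)
		x[i] = (unsigned char) ((x[i] >> 1) | (x[i - 1] << 7));
	x[0] >>= 1;

	if (carry)
		x[0] ^= 0xE1;                 /* apply the modular polynomial */
}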
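In the same spirit, the 16-bit entries generated by the xp(i) macro can be recomputed at run time instead of being read from the compiled table. The helper below is a sketch (the name xp_runtime is hypothetical); with the big-endian xp_fun definition, where xp_fun(lo,hi) evaluates to hi | (lo << 8), its result should coincide with gft_be[i].

/* Sketch: reduction word for an overflow byte i.  Each set bit k of i
   contributes 0xE100 >> (7 - k), 0xE1 being the same reduction byte that
   gf_poly and mul_x use. */
static unsigned short xp_runtime (unsigned int i)
{
	unsigned int k, r = 0;

	for (k = 0; k < 8; ++k)
		if (i & (1u << k))
			r ^= 0xE100u >> (7 - k);

	return (unsigned short) r;
}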