From fc37cc4a02ed13d1a73b941a9f80975600fd1b99 Mon Sep 17 00:00:00 2001 From: David Foerster Date: Tue, 10 May 2016 20:20:14 +0200 Subject: Normalize all line terminators --- src/Crypto/AesSmall_x86.asm | 2888 +++++++++++++++++++++---------------------- 1 file changed, 1444 insertions(+), 1444 deletions(-) (limited to 'src/Crypto/AesSmall_x86.asm') diff --git a/src/Crypto/AesSmall_x86.asm b/src/Crypto/AesSmall_x86.asm index fe7dc47b..de32fc66 100644 --- a/src/Crypto/AesSmall_x86.asm +++ b/src/Crypto/AesSmall_x86.asm @@ -1,1444 +1,1444 @@ - -; --------------------------------------------------------------------------- -; Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. -; -; LICENSE TERMS -; -; The free distribution and use of this software is allowed (with or without -; changes) provided that: -; -; 1. source code distributions include the above copyright notice, this -; list of conditions and the following disclaimer; -; -; 2. binary distributions include the above copyright notice, this list -; of conditions and the following disclaimer in their documentation; -; -; 3. the name of the copyright holder is not used to endorse products -; built using this software without specific written permission. -; -; DISCLAIMER -; -; This software is provided 'as is' with no explicit or implied warranties -; in respect of its properties, including, but not limited to, correctness -; and/or fitness for purpose. -; --------------------------------------------------------------------------- -; Issue 20/12/2007 -; -; This code requires either ASM_X86_V2 or ASM_X86_V2C to be set in aesopt.h -; and the same define to be set here as well. If AES_V2C is set this file -; requires the C files aeskey.c and aestab.c for support. - -; An AES implementation for x86 processors using the YASM (or NASM) assembler. -; This is a full assembler implementation covering encryption, decryption and -; key scheduling. It uses 2k bytes of tables but its encryption and decryption -; performance is very close to that obtained using large tables. Key schedule -; expansion is slower for both encryption and decryption but this is likely to -; be offset by the much smaller load that this version places on the processor -; cache. I acknowledge the contribution made by Daniel Bernstein to aspects of -; the design of the AES round function used here. -; -; This code provides the standard AES block size (128 bits, 16 bytes) and the -; three standard AES key sizes (128, 192 and 256 bits). It has the same call -; interface as my C implementation. The ebx, esi, edi and ebp registers are -; preserved across calls but eax, ecx and edx and the artihmetic status flags -; are not. Although this is a full assembler implementation, it can be used -; in conjunction with my C code which provides faster key scheduling using -; large tables. In this case aeskey.c should be compiled with ASM_X86_V2C -; defined. It is also important that the defines below match those used in the -; C code. This code uses the VC++ register saving conentions; if it is used -; with another compiler, conventions for using and saving registers may need -; to be checked (and calling conventions). The YASM command line for the VC++ -; custom build step is: -; -; yasm -Xvc -f win32 -D -o "$(TargetDir)\$(InputName).obj" "$(InputPath)" -; -; For the cryptlib build this is (pcg): -; -; yasm -Xvc -f win32 -D ASM_X86_V2C -o aescrypt2.obj aes_x86_v2.asm -; -; where is ASM_X86_V2 or ASM_X86_V2C. The calling intefaces are: -; -; AES_RETURN aes_encrypt(const unsigned char in_blk[], -; unsigned char out_blk[], const aes_encrypt_ctx cx[1]); -; -; AES_RETURN aes_decrypt(const unsigned char in_blk[], -; unsigned char out_blk[], const aes_decrypt_ctx cx[1]); -; -; AES_RETURN aes_encrypt_key(const unsigned char key[], -; const aes_encrypt_ctx cx[1]); -; -; AES_RETURN aes_decrypt_key(const unsigned char key[], -; const aes_decrypt_ctx cx[1]); -; -; AES_RETURN aes_encrypt_key(const unsigned char key[], -; unsigned int len, const aes_decrypt_ctx cx[1]); -; -; AES_RETURN aes_decrypt_key(const unsigned char key[], -; unsigned int len, const aes_decrypt_ctx cx[1]); -; -; where is 128, 102 or 256. In the last two calls the length can be in -; either bits or bytes. - -; The DLL interface must use the _stdcall convention in which the number -; of bytes of parameter space is added after an @ to the sutine's name. -; We must also remove our parameters from the stack before return (see -; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version. - -; -; Adapted for TrueCrypt: -; - All tables generated at run-time -; - Adapted for 16-bit environment -; - -CPU 386 -USE16 -SEGMENT _TEXT PUBLIC CLASS=CODE USE16 -SEGMENT _DATA PUBLIC CLASS=DATA USE16 - -GROUP DGROUP _TEXT _DATA - -extern _aes_dec_tab ; Aestab.c -extern _aes_enc_tab - -; %define DLL_EXPORT - -; The size of the code can be reduced by using functions for the encryption -; and decryption rounds in place of macro expansion - -%define REDUCE_CODE_SIZE - -; Comment in/out the following lines to obtain the desired subroutines. These -; selections MUST match those in the C header file aes.h - -; %define AES_128 ; define if AES with 128 bit keys is needed -; %define AES_192 ; define if AES with 192 bit keys is needed -%define AES_256 ; define if AES with 256 bit keys is needed -; %define AES_VAR ; define if a variable key size is needed -%define ENCRYPTION ; define if encryption is needed -%define DECRYPTION ; define if decryption is needed -; %define AES_REV_DKS ; define if key decryption schedule is reversed - -%ifndef ASM_X86_V2C -%define ENCRYPTION_KEY_SCHEDULE ; define if encryption key expansion is needed -%define DECRYPTION_KEY_SCHEDULE ; define if decryption key expansion is needed -%endif - -; The encryption key schedule has the following in memory layout where N is the -; number of rounds (10, 12 or 14): -; -; lo: | input key (round 0) | ; each round is four 32-bit words -; | encryption round 1 | -; | encryption round 2 | -; .... -; | encryption round N-1 | -; hi: | encryption round N | -; -; The decryption key schedule is normally set up so that it has the same -; layout as above by actually reversing the order of the encryption key -; schedule in memory (this happens when AES_REV_DKS is set): -; -; lo: | decryption round 0 | = | encryption round N | -; | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ] -; | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ] -; .... .... -; | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ] -; hi: | decryption round N | = | input key (round 0) | -; -; with rounds except the first and last modified using inv_mix_column() -; But if AES_REV_DKS is NOT set the order of keys is left as it is for -; encryption so that it has to be accessed in reverse when used for -; decryption (although the inverse mix column modifications are done) -; -; lo: | decryption round 0 | = | input key (round 0) | -; | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ] -; | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ] -; .... .... -; | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ] -; hi: | decryption round N | = | encryption round N | -; -; This layout is faster when the assembler key scheduling provided here -; is used. -; -; End of user defines - -%ifdef AES_VAR -%ifndef AES_128 -%define AES_128 -%endif -%ifndef AES_192 -%define AES_192 -%endif -%ifndef AES_256 -%define AES_256 -%endif -%endif - -%ifdef AES_VAR -%define KS_LENGTH 60 -%elifdef AES_256 -%define KS_LENGTH 60 -%elifdef AES_192 -%define KS_LENGTH 52 -%else -%define KS_LENGTH 44 -%endif - -; These macros implement stack based local variables - -%macro save 2 - mov [esp+4*%1],%2 -%endmacro - -%macro restore 2 - mov %1,[esp+4*%2] -%endmacro - -%ifdef REDUCE_CODE_SIZE - %macro mf_call 1 - call %1 - %endmacro -%else - %macro mf_call 1 - %1 - %endmacro -%endif - -; the DLL has to implement the _stdcall calling interface on return -; In this case we have to take our parameters (3 4-byte pointers) -; off the stack - -%define parms 12 - -%macro do_name 1-2 parms -%ifndef DLL_EXPORT - global %1 -%1: -%else - global %1@%2 - export %1@%2 -%1@%2: -%endif -%endmacro - -%macro do_call 1-2 parms -%ifndef DLL_EXPORT - call %1 - add esp,%2 -%else - call %1@%2 -%endif -%endmacro - -%macro do_exit 0-1 parms -%ifdef DLL_EXPORT - ret %1 -%else - ret -%endif -%endmacro - -; finite field multiplies by {02}, {04} and {08} - -%define f2(x) ((x<<1)^(((x>>7)&1)*0x11b)) -%define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) -%define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) - -; finite field multiplies required in table generation - -%define f3(x) (f2(x) ^ x) -%define f9(x) (f8(x) ^ x) -%define fb(x) (f8(x) ^ f2(x) ^ x) -%define fd(x) (f8(x) ^ f4(x) ^ x) -%define fe(x) (f8(x) ^ f4(x) ^ f2(x)) - -%define etab_0(x) [_aes_enc_tab+4+8*x] -%define etab_1(x) [_aes_enc_tab+3+8*x] -%define etab_2(x) [_aes_enc_tab+2+8*x] -%define etab_3(x) [_aes_enc_tab+1+8*x] -%define etab_b(x) byte [_aes_enc_tab+1+8*x] ; used with movzx for 0x000000xx -%define etab_w(x) word [_aes_enc_tab+8*x] ; used with movzx for 0x0000xx00 - -%define btab_0(x) [_aes_enc_tab+6+8*x] -%define btab_1(x) [_aes_enc_tab+5+8*x] -%define btab_2(x) [_aes_enc_tab+4+8*x] -%define btab_3(x) [_aes_enc_tab+3+8*x] - -; ROUND FUNCTION. Build column[2] on ESI and column[3] on EDI that have the -; round keys pre-loaded. Build column[0] in EBP and column[1] in EBX. -; -; Input: -; -; EAX column[0] -; EBX column[1] -; ECX column[2] -; EDX column[3] -; ESI column key[round][2] -; EDI column key[round][3] -; EBP scratch -; -; Output: -; -; EBP column[0] unkeyed -; EBX column[1] unkeyed -; ESI column[2] keyed -; EDI column[3] keyed -; EAX scratch -; ECX scratch -; EDX scratch - -%macro rnd_fun 2 - - rol ebx,16 - %1 esi, cl, 0, ebp - %1 esi, dh, 1, ebp - %1 esi, bh, 3, ebp - %1 edi, dl, 0, ebp - %1 edi, ah, 1, ebp - %1 edi, bl, 2, ebp - %2 ebp, al, 0, ebp - shr ebx,16 - and eax,0xffff0000 - or eax,ebx - shr edx,16 - %1 ebp, ah, 1, ebx - %1 ebp, dh, 3, ebx - %2 ebx, dl, 2, ebx - %1 ebx, ch, 1, edx - %1 ebx, al, 0, edx - shr eax,16 - shr ecx,16 - %1 ebp, cl, 2, edx - %1 edi, ch, 3, edx - %1 esi, al, 2, edx - %1 ebx, ah, 3, edx - -%endmacro - -; Basic MOV and XOR Operations for normal rounds - -%macro nr_xor 4 - movzx %4,%2 - xor %1,etab_%3(%4) -%endmacro - -%macro nr_mov 4 - movzx %4,%2 - mov %1,etab_%3(%4) -%endmacro - -; Basic MOV and XOR Operations for last round - -%if 1 - - %macro lr_xor 4 - movzx %4,%2 - movzx %4,etab_b(%4) - %if %3 != 0 - shl %4,8*%3 - %endif - xor %1,%4 - %endmacro - - %macro lr_mov 4 - movzx %4,%2 - movzx %1,etab_b(%4) - %if %3 != 0 - shl %1,8*%3 - %endif - %endmacro - -%else ; less effective but worth leaving as an option - - %macro lr_xor 4 - movzx %4,%2 - mov %4,btab_%3(%4) - and %4,0x000000ff << 8 * %3 - xor %1,%4 - %endmacro - - %macro lr_mov 4 - movzx %4,%2 - mov %1,btab_%3(%4) - and %1,0x000000ff << 8 * %3 - %endmacro - -%endif - -; Apply S-Box to the 4 bytes in a 32-bit word and rotate byte positions - -%ifdef REDUCE_CODE_SIZE - -l3s_col: - movzx ecx,al ; in eax - movzx ecx, etab_b(ecx) ; out eax - xor edx,ecx ; scratch ecx,edx - movzx ecx,ah - movzx ecx, etab_b(ecx) - shl ecx,8 - xor edx,ecx - shr eax,16 - movzx ecx,al - movzx ecx, etab_b(ecx) - shl ecx,16 - xor edx,ecx - movzx ecx,ah - movzx ecx, etab_b(ecx) - shl ecx,24 - xor edx,ecx - mov eax,edx - ret - -%else - -%macro l3s_col 0 - - movzx ecx,al ; in eax - movzx ecx, etab_b(ecx) ; out eax - xor edx,ecx ; scratch ecx,edx - movzx ecx,ah - movzx ecx, etab_b(ecx) - shl ecx,8 - xor edx,ecx - shr eax,16 - movzx ecx,al - movzx ecx, etab_b(ecx) - shl ecx,16 - xor edx,ecx - movzx ecx,ah - movzx ecx, etab_b(ecx) - shl ecx,24 - xor edx,ecx - mov eax,edx - -%endmacro - -%endif - -; offsets to parameters - -in_blk equ 2 ; input byte array address parameter -out_blk equ 4 ; output byte array address parameter -ctx equ 6 ; AES context structure -stk_spc equ 20 ; stack space - -%ifdef ENCRYPTION - -; %define ENCRYPTION_TABLE - -%ifdef REDUCE_CODE_SIZE - -enc_round: - sub sp, 2 - add ebp,16 - save 1,ebp - mov esi,[ebp+8] - mov edi,[ebp+12] - - rnd_fun nr_xor, nr_mov - - mov eax,ebp - mov ecx,esi - mov edx,edi - restore ebp,1 - xor eax,[ebp] - xor ebx,[ebp+4] - add sp, 2 - ret - -%else - -%macro enc_round 0 - - add ebp,16 - save 0,ebp - mov esi,[ebp+8] - mov edi,[ebp+12] - - rnd_fun nr_xor, nr_mov - - mov eax,ebp - mov ecx,esi - mov edx,edi - restore ebp,0 - xor eax,[ebp] - xor ebx,[ebp+4] - -%endmacro - -%endif - -%macro enc_last_round 0 - - add ebp,16 - save 0,ebp - mov esi,[ebp+8] - mov edi,[ebp+12] - - rnd_fun lr_xor, lr_mov - - mov eax,ebp - restore ebp,0 - xor eax,[ebp] - xor ebx,[ebp+4] - -%endmacro - - section _TEXT - -; AES Encryption Subroutine - - do_name _aes_encrypt,12 - - mov ax, sp - movzx esp, ax - - sub esp,stk_spc - mov [esp+16],ebp - mov [esp+12],ebx - mov [esp+ 8],esi - mov [esp+ 4],edi - - movzx esi,word [esp+in_blk+stk_spc] ; input pointer - mov eax,[esi ] - mov ebx,[esi+ 4] - mov ecx,[esi+ 8] - mov edx,[esi+12] - - movzx ebp,word [esp+ctx+stk_spc] ; key pointer - movzx edi,byte [ebp+4*KS_LENGTH] - xor eax,[ebp ] - xor ebx,[ebp+ 4] - xor ecx,[ebp+ 8] - xor edx,[ebp+12] - -; determine the number of rounds - -%ifndef AES_256 - cmp edi,10*16 - je .3 - cmp edi,12*16 - je .2 - cmp edi,14*16 - je .1 - mov eax,-1 - jmp .5 -%endif - -.1: mf_call enc_round - mf_call enc_round -.2: mf_call enc_round - mf_call enc_round -.3: mf_call enc_round - mf_call enc_round - mf_call enc_round - mf_call enc_round - mf_call enc_round - mf_call enc_round - mf_call enc_round - mf_call enc_round - mf_call enc_round - enc_last_round - - movzx edx,word [esp+out_blk+stk_spc] - mov [edx],eax - mov [edx+4],ebx - mov [edx+8],esi - mov [edx+12],edi - xor eax,eax - -.5: mov ebp,[esp+16] - mov ebx,[esp+12] - mov esi,[esp+ 8] - mov edi,[esp+ 4] - add esp,stk_spc - do_exit 12 - -%endif - -%macro f_key 2 - - push ecx - push edx - mov edx,esi - ror eax,8 - mf_call l3s_col - mov esi,eax - pop edx - pop ecx - xor esi,rc_val - - mov [ebp+%1*%2],esi - xor edi,esi - mov [ebp+%1*%2+4],edi - xor ecx,edi - mov [ebp+%1*%2+8],ecx - xor edx,ecx - mov [ebp+%1*%2+12],edx - mov eax,edx - -%if %2 == 24 - -%if %1 < 7 - xor eax,[ebp+%1*%2+16-%2] - mov [ebp+%1*%2+16],eax - xor eax,[ebp+%1*%2+20-%2] - mov [ebp+%1*%2+20],eax -%endif - -%elif %2 == 32 - -%if %1 < 6 - push ecx - push edx - mov edx,[ebp+%1*%2+16-%2] - mf_call l3s_col - pop edx - pop ecx - mov [ebp+%1*%2+16],eax - xor eax,[ebp+%1*%2+20-%2] - mov [ebp+%1*%2+20],eax - xor eax,[ebp+%1*%2+24-%2] - mov [ebp+%1*%2+24],eax - xor eax,[ebp+%1*%2+28-%2] - mov [ebp+%1*%2+28],eax -%endif - -%endif - -%assign rc_val f2(rc_val) - -%endmacro - -%ifdef ENCRYPTION_KEY_SCHEDULE - -%ifdef AES_128 - -%ifndef ENCRYPTION_TABLE -; %define ENCRYPTION_TABLE -%endif - -%assign rc_val 1 - - do_name _aes_encrypt_key128,8 - - push ebp - push ebx - push esi - push edi - - mov ebp,[esp+24] - mov [ebp+4*KS_LENGTH],dword 10*16 - mov ebx,[esp+20] - - mov esi,[ebx] - mov [ebp],esi - mov edi,[ebx+4] - mov [ebp+4],edi - mov ecx,[ebx+8] - mov [ebp+8],ecx - mov edx,[ebx+12] - mov [ebp+12],edx - add ebp,16 - mov eax,edx - - f_key 0,16 ; 11 * 4 = 44 unsigned longs - f_key 1,16 ; 4 + 4 * 10 generated = 44 - f_key 2,16 - f_key 3,16 - f_key 4,16 - f_key 5,16 - f_key 6,16 - f_key 7,16 - f_key 8,16 - f_key 9,16 - - pop edi - pop esi - pop ebx - pop ebp - xor eax,eax - do_exit 8 - -%endif - -%ifdef AES_192 - -%ifndef ENCRYPTION_TABLE -; %define ENCRYPTION_TABLE -%endif - -%assign rc_val 1 - - do_name _aes_encrypt_key192,8 - - push ebp - push ebx - push esi - push edi - - mov ebp,[esp+24] - mov [ebp+4*KS_LENGTH],dword 12 * 16 - mov ebx,[esp+20] - - mov esi,[ebx] - mov [ebp],esi - mov edi,[ebx+4] - mov [ebp+4],edi - mov ecx,[ebx+8] - mov [ebp+8],ecx - mov edx,[ebx+12] - mov [ebp+12],edx - mov eax,[ebx+16] - mov [ebp+16],eax - mov eax,[ebx+20] - mov [ebp+20],eax - add ebp,24 - - f_key 0,24 ; 13 * 4 = 52 unsigned longs - f_key 1,24 ; 6 + 6 * 8 generated = 54 - f_key 2,24 - f_key 3,24 - f_key 4,24 - f_key 5,24 - f_key 6,24 - f_key 7,24 - - pop edi - pop esi - pop ebx - pop ebp - xor eax,eax - do_exit 8 - -%endif - -%ifdef AES_256 - -%ifndef ENCRYPTION_TABLE -; %define ENCRYPTION_TABLE -%endif - -%assign rc_val 1 - - do_name _aes_encrypt_key256,8 - - mov ax, sp - movzx esp, ax - - push ebp - push ebx - push esi - push edi - - movzx ebp, word [esp+20] ; ks - mov [ebp+4*KS_LENGTH],dword 14 * 16 - movzx ebx, word [esp+18] ; key - - mov esi,[ebx] - mov [ebp],esi - mov edi,[ebx+4] - mov [ebp+4],edi - mov ecx,[ebx+8] - mov [ebp+8],ecx - mov edx,[ebx+12] - mov [ebp+12],edx - mov eax,[ebx+16] - mov [ebp+16],eax - mov eax,[ebx+20] - mov [ebp+20],eax - mov eax,[ebx+24] - mov [ebp+24],eax - mov eax,[ebx+28] - mov [ebp+28],eax - add ebp,32 - - f_key 0,32 ; 15 * 4 = 60 unsigned longs - f_key 1,32 ; 8 + 8 * 7 generated = 64 - f_key 2,32 - f_key 3,32 - f_key 4,32 - f_key 5,32 - f_key 6,32 - - pop edi - pop esi - pop ebx - pop ebp - xor eax,eax - do_exit 8 - -%endif - -%ifdef AES_VAR - -%ifndef ENCRYPTION_TABLE -; %define ENCRYPTION_TABLE -%endif - - do_name _aes_encrypt_key,12 - - mov ecx,[esp+4] - mov eax,[esp+8] - mov edx,[esp+12] - push edx - push ecx - - cmp eax,16 - je .1 - cmp eax,128 - je .1 - - cmp eax,24 - je .2 - cmp eax,192 - je .2 - - cmp eax,32 - je .3 - cmp eax,256 - je .3 - mov eax,-1 - add esp,8 - do_exit 12 - -.1: do_call _aes_encrypt_key128,8 - do_exit 12 -.2: do_call _aes_encrypt_key192,8 - do_exit 12 -.3: do_call _aes_encrypt_key256,8 - do_exit 12 - -%endif - -%endif - -%ifdef ENCRYPTION_TABLE - -; S-box data - 256 entries - - section _DATA - -%define u8(x) 0, x, x, f3(x), f2(x), x, x, f3(x) - -_aes_enc_tab: - db u8(0x63),u8(0x7c),u8(0x77),u8(0x7b),u8(0xf2),u8(0x6b),u8(0x6f),u8(0xc5) - db u8(0x30),u8(0x01),u8(0x67),u8(0x2b),u8(0xfe),u8(0xd7),u8(0xab),u8(0x76) - db u8(0xca),u8(0x82),u8(0xc9),u8(0x7d),u8(0xfa),u8(0x59),u8(0x47),u8(0xf0) - db u8(0xad),u8(0xd4),u8(0xa2),u8(0xaf),u8(0x9c),u8(0xa4),u8(0x72),u8(0xc0) - db u8(0xb7),u8(0xfd),u8(0x93),u8(0x26),u8(0x36),u8(0x3f),u8(0xf7),u8(0xcc) - db u8(0x34),u8(0xa5),u8(0xe5),u8(0xf1),u8(0x71),u8(0xd8),u8(0x31),u8(0x15) - db u8(0x04),u8(0xc7),u8(0x23),u8(0xc3),u8(0x18),u8(0x96),u8(0x05),u8(0x9a) - db u8(0x07),u8(0x12),u8(0x80),u8(0xe2),u8(0xeb),u8(0x27),u8(0xb2),u8(0x75) - db u8(0x09),u8(0x83),u8(0x2c),u8(0x1a),u8(0x1b),u8(0x6e),u8(0x5a),u8(0xa0) - db u8(0x52),u8(0x3b),u8(0xd6),u8(0xb3),u8(0x29),u8(0xe3),u8(0x2f),u8(0x84) - db u8(0x53),u8(0xd1),u8(0x00),u8(0xed),u8(0x20),u8(0xfc),u8(0xb1),u8(0x5b) - db u8(0x6a),u8(0xcb),u8(0xbe),u8(0x39),u8(0x4a),u8(0x4c),u8(0x58),u8(0xcf) - db u8(0xd0),u8(0xef),u8(0xaa),u8(0xfb),u8(0x43),u8(0x4d),u8(0x33),u8(0x85) - db u8(0x45),u8(0xf9),u8(0x02),u8(0x7f),u8(0x50),u8(0x3c),u8(0x9f),u8(0xa8) - db u8(0x51),u8(0xa3),u8(0x40),u8(0x8f),u8(0x92),u8(0x9d),u8(0x38),u8(0xf5) - db u8(0xbc),u8(0xb6),u8(0xda),u8(0x21),u8(0x10),u8(0xff),u8(0xf3),u8(0xd2) - db u8(0xcd),u8(0x0c),u8(0x13),u8(0xec),u8(0x5f),u8(0x97),u8(0x44),u8(0x17) - db u8(0xc4),u8(0xa7),u8(0x7e),u8(0x3d),u8(0x64),u8(0x5d),u8(0x19),u8(0x73) - db u8(0x60),u8(0x81),u8(0x4f),u8(0xdc),u8(0x22),u8(0x2a),u8(0x90),u8(0x88) - db u8(0x46),u8(0xee),u8(0xb8),u8(0x14),u8(0xde),u8(0x5e),u8(0x0b),u8(0xdb) - db u8(0xe0),u8(0x32),u8(0x3a),u8(0x0a),u8(0x49),u8(0x06),u8(0x24),u8(0x5c) - db u8(0xc2),u8(0xd3),u8(0xac),u8(0x62),u8(0x91),u8(0x95),u8(0xe4),u8(0x79) - db u8(0xe7),u8(0xc8),u8(0x37),u8(0x6d),u8(0x8d),u8(0xd5),u8(0x4e),u8(0xa9) - db u8(0x6c),u8(0x56),u8(0xf4),u8(0xea),u8(0x65),u8(0x7a),u8(0xae),u8(0x08) - db u8(0xba),u8(0x78),u8(0x25),u8(0x2e),u8(0x1c),u8(0xa6),u8(0xb4),u8(0xc6) - db u8(0xe8),u8(0xdd),u8(0x74),u8(0x1f),u8(0x4b),u8(0xbd),u8(0x8b),u8(0x8a) - db u8(0x70),u8(0x3e),u8(0xb5),u8(0x66),u8(0x48),u8(0x03),u8(0xf6),u8(0x0e) - db u8(0x61),u8(0x35),u8(0x57),u8(0xb9),u8(0x86),u8(0xc1),u8(0x1d),u8(0x9e) - db u8(0xe1),u8(0xf8),u8(0x98),u8(0x11),u8(0x69),u8(0xd9),u8(0x8e),u8(0x94) - db u8(0x9b),u8(0x1e),u8(0x87),u8(0xe9),u8(0xce),u8(0x55),u8(0x28),u8(0xdf) - db u8(0x8c),u8(0xa1),u8(0x89),u8(0x0d),u8(0xbf),u8(0xe6),u8(0x42),u8(0x68) - db u8(0x41),u8(0x99),u8(0x2d),u8(0x0f),u8(0xb0),u8(0x54),u8(0xbb),u8(0x16) - -%endif - -%ifdef DECRYPTION - -; %define DECRYPTION_TABLE - -%define dtab_0(x) [_aes_dec_tab+ 8*x] -%define dtab_1(x) [_aes_dec_tab+3+8*x] -%define dtab_2(x) [_aes_dec_tab+2+8*x] -%define dtab_3(x) [_aes_dec_tab+1+8*x] -%define dtab_x(x) byte [_aes_dec_tab+7+8*x] - -%macro irn_fun 2 - - rol eax,16 - %1 esi, cl, 0, ebp - %1 esi, bh, 1, ebp - %1 esi, al, 2, ebp - %1 edi, dl, 0, ebp - %1 edi, ch, 1, ebp - %1 edi, ah, 3, ebp - %2 ebp, bl, 0, ebp - shr eax,16 - and ebx,0xffff0000 - or ebx,eax - shr ecx,16 - %1 ebp, bh, 1, eax - %1 ebp, ch, 3, eax - %2 eax, cl, 2, ecx - %1 eax, bl, 0, ecx - %1 eax, dh, 1, ecx - shr ebx,16 - shr edx,16 - %1 esi, dh, 3, ecx - %1 ebp, dl, 2, ecx - %1 eax, bh, 3, ecx - %1 edi, bl, 2, ecx - -%endmacro - -; Basic MOV and XOR Operations for normal rounds - -%macro ni_xor 4 - movzx %4,%2 - xor %1,dtab_%3(%4) -%endmacro - -%macro ni_mov 4 - movzx %4,%2 - mov %1,dtab_%3(%4) -%endmacro - -; Basic MOV and XOR Operations for last round - -%macro li_xor 4 - movzx %4,%2 - movzx %4,dtab_x(%4) -%if %3 != 0 - shl %4,8*%3 -%endif - xor %1,%4 -%endmacro - -%macro li_mov 4 - movzx %4,%2 - movzx %1,dtab_x(%4) -%if %3 != 0 - shl %1,8*%3 -%endif -%endmacro - -%ifdef REDUCE_CODE_SIZE - -dec_round: - sub sp, 2 -%ifdef AES_REV_DKS - add ebp,16 -%else - sub ebp,16 -%endif - save 1,ebp - mov esi,[ebp+8] - mov edi,[ebp+12] - - irn_fun ni_xor, ni_mov - - mov ebx,ebp - mov ecx,esi - mov edx,edi - restore ebp,1 - xor eax,[ebp] - xor ebx,[ebp+4] - add sp, 2 - ret - -%else - -%macro dec_round 0 - -%ifdef AES_REV_DKS - add ebp,16 -%else - sub ebp,16 -%endif - save 0,ebp - mov esi,[ebp+8] - mov edi,[ebp+12] - - irn_fun ni_xor, ni_mov - - mov ebx,ebp - mov ecx,esi - mov edx,edi - restore ebp,0 - xor eax,[ebp] - xor ebx,[ebp+4] - -%endmacro - -%endif - -%macro dec_last_round 0 - -%ifdef AES_REV_DKS - add ebp,16 -%else - sub ebp,16 -%endif - save 0,ebp - mov esi,[ebp+8] - mov edi,[ebp+12] - - irn_fun li_xor, li_mov - - mov ebx,ebp - restore ebp,0 - xor eax,[ebp] - xor ebx,[ebp+4] - -%endmacro - - section _TEXT - -; AES Decryption Subroutine - - do_name _aes_decrypt,12 - - mov ax, sp - movzx esp, ax - - sub esp,stk_spc - mov [esp+16],ebp - mov [esp+12],ebx - mov [esp+ 8],esi - mov [esp+ 4],edi - -; input four columns and xor in first round key - - movzx esi,word [esp+in_blk+stk_spc] ; input pointer - mov eax,[esi ] - mov ebx,[esi+ 4] - mov ecx,[esi+ 8] - mov edx,[esi+12] - lea esi,[esi+16] - - movzx ebp, word [esp+ctx+stk_spc] ; key pointer - movzx edi,byte[ebp+4*KS_LENGTH] -%ifndef AES_REV_DKS ; if decryption key schedule is not reversed - lea ebp,[ebp+edi] ; we have to access it from the top down -%endif - xor eax,[ebp ] ; key schedule - xor ebx,[ebp+ 4] - xor ecx,[ebp+ 8] - xor edx,[ebp+12] - -; determine the number of rounds - -%ifndef AES_256 - cmp edi,10*16 - je .3 - cmp edi,12*16 - je .2 - cmp edi,14*16 - je .1 - mov eax,-1 - jmp .5 -%endif - -.1: mf_call dec_round - mf_call dec_round -.2: mf_call dec_round - mf_call dec_round -.3: mf_call dec_round - mf_call dec_round - mf_call dec_round - mf_call dec_round - mf_call dec_round - mf_call dec_round - mf_call dec_round - mf_call dec_round - mf_call dec_round - dec_last_round - -; move final values to the output array. - - movzx ebp,word [esp+out_blk+stk_spc] - mov [ebp],eax - mov [ebp+4],ebx - mov [ebp+8],esi - mov [ebp+12],edi - xor eax,eax - -.5: mov ebp,[esp+16] - mov ebx,[esp+12] - mov esi,[esp+ 8] - mov edi,[esp+ 4] - add esp,stk_spc - do_exit 12 - -%endif - -%ifdef REDUCE_CODE_SIZE - -inv_mix_col: - movzx ecx,dl ; input eax, edx - movzx ecx,etab_b(ecx) ; output eax - mov eax,dtab_0(ecx) ; used ecx - movzx ecx,dh - shr edx,16 - movzx ecx,etab_b(ecx) - xor eax,dtab_1(ecx) - movzx ecx,dl - movzx ecx,etab_b(ecx) - xor eax,dtab_2(ecx) - movzx ecx,dh - movzx ecx,etab_b(ecx) - xor eax,dtab_3(ecx) - ret - -%else - -%macro inv_mix_col 0 - - movzx ecx,dl ; input eax, edx - movzx ecx,etab_b(ecx) ; output eax - mov eax,dtab_0(ecx) ; used ecx - movzx ecx,dh - shr edx,16 - movzx ecx,etab_b(ecx) - xor eax,dtab_1(ecx) - movzx ecx,dl - movzx ecx,etab_b(ecx) - xor eax,dtab_2(ecx) - movzx ecx,dh - movzx ecx,etab_b(ecx) - xor eax,dtab_3(ecx) - -%endmacro - -%endif - -%ifdef DECRYPTION_KEY_SCHEDULE - -%ifdef AES_128 - -%ifndef DECRYPTION_TABLE -; %define DECRYPTION_TABLE -%endif - - do_name _aes_decrypt_key128,8 - - push ebp - push ebx - push esi - push edi - mov eax,[esp+24] ; context - mov edx,[esp+20] ; key - push eax - push edx - do_call _aes_encrypt_key128,8 ; generate expanded encryption key - mov eax,10*16 - mov esi,[esp+24] ; pointer to first round key - lea edi,[esi+eax] ; pointer to last round key - add esi,32 - ; the inverse mix column transformation - mov edx,[esi-16] ; needs to be applied to all round keys - mf_call inv_mix_col ; except first and last. Hence start by - mov [esi-16],eax ; transforming the four sub-keys in the - mov edx,[esi-12] ; second round key - mf_call inv_mix_col - mov [esi-12],eax ; transformations for subsequent rounds - mov edx,[esi-8] ; can then be made more efficient by - mf_call inv_mix_col ; noting that for three of the four sub-keys - mov [esi-8],eax ; in the encryption round key ek[r]: - mov edx,[esi-4] ; - mf_call inv_mix_col ; ek[r][n] = ek[r][n-1] ^ ek[r-1][n] - mov [esi-4],eax ; - ; where n is 1..3. Hence the corresponding -.0: mov edx,[esi] ; subkeys in the decryption round key dk[r] - mf_call inv_mix_col ; also obey since inv_mix_col is linear in - mov [esi],eax ; GF(256): - xor eax,[esi-12] ; - mov [esi+4],eax ; dk[r][n] = dk[r][n-1] ^ dk[r-1][n] - xor eax,[esi-8] ; - mov [esi+8],eax ; So we only need one inverse mix column - xor eax,[esi-4] ; operation (n = 0) for each four word cycle - mov [esi+12],eax ; in the expanded key. - add esi,16 - cmp edi,esi - jg .0 - jmp dec_end - -%endif - -%ifdef AES_192 - -%ifndef DECRYPTION_TABLE -; %define DECRYPTION_TABLE -%endif - - do_name _aes_decrypt_key192,8 - - push ebp - push ebx - push esi - push edi - mov eax,[esp+24] ; context - mov edx,[esp+20] ; key - push eax - push edx - do_call _aes_encrypt_key192,8 ; generate expanded encryption key - mov eax,12*16 - mov esi,[esp+24] ; first round key - lea edi,[esi+eax] ; last round key - add esi,48 ; the first 6 words are the key, of - ; which the top 2 words are part of - mov edx,[esi-32] ; the second round key and hence - mf_call inv_mix_col ; need to be modified. After this we - mov [esi-32],eax ; need to do a further six values prior - mov edx,[esi-28] ; to using a more efficient technique - mf_call inv_mix_col ; based on: - mov [esi-28],eax ; - ; dk[r][n] = dk[r][n-1] ^ dk[r-1][n] - mov edx,[esi-24] ; - mf_call inv_mix_col ; for n = 1 .. 5 where the key expansion - mov [esi-24],eax ; cycle is now 6 words long - mov edx,[esi-20] - mf_call inv_mix_col - mov [esi-20],eax - mov edx,[esi-16] - mf_call inv_mix_col - mov [esi-16],eax - mov edx,[esi-12] - mf_call inv_mix_col - mov [esi-12],eax - mov edx,[esi-8] - mf_call inv_mix_col - mov [esi-8],eax - mov edx,[esi-4] - mf_call inv_mix_col - mov [esi-4],eax - -.0: mov edx,[esi] ; the expanded key is 13 * 4 = 44 32-bit words - mf_call inv_mix_col ; of which 11 * 4 = 44 have to be modified - mov [esi],eax ; using inv_mix_col. We have already done 8 - xor eax,[esi-20] ; of these so 36 are left - hence we need - mov [esi+4],eax ; exactly 6 loops of six here - xor eax,[esi-16] - mov [esi+8],eax - xor eax,[esi-12] - mov [esi+12],eax - xor eax,[esi-8] - mov [esi+16],eax - xor eax,[esi-4] - mov [esi+20],eax - add esi,24 - cmp edi,esi - jg .0 - jmp dec_end - -%endif - -%ifdef AES_256 - -%ifndef DECRYPTION_TABLE -; %define DECRYPTION_TABLE -%endif - - do_name _aes_decrypt_key256,8 - - mov ax, sp - movzx esp, ax - push ebp - push ebx - push esi - push edi - - movzx eax, word [esp+20] ; ks - movzx edx, word [esp+18] ; key - push ax - push dx - do_call _aes_encrypt_key256,4 ; generate expanded encryption key - mov eax,14*16 - movzx esi, word [esp+20] ; ks - lea edi,[esi+eax] - add esi,64 - - mov edx,[esi-48] ; the primary key is 8 words, of which - mf_call inv_mix_col ; the top four require modification - mov [esi-48],eax - mov edx,[esi-44] - mf_call inv_mix_col - mov [esi-44],eax - mov edx,[esi-40] - mf_call inv_mix_col - mov [esi-40],eax - mov edx,[esi-36] - mf_call inv_mix_col - mov [esi-36],eax - - mov edx,[esi-32] ; the encryption key expansion cycle is - mf_call inv_mix_col ; now eight words long so we need to - mov [esi-32],eax ; start by doing one complete block - mov edx,[esi-28] - mf_call inv_mix_col - mov [esi-28],eax - mov edx,[esi-24] - mf_call inv_mix_col - mov [esi-24],eax - mov edx,[esi-20] - mf_call inv_mix_col - mov [esi-20],eax - mov edx,[esi-16] - mf_call inv_mix_col - mov [esi-16],eax - mov edx,[esi-12] - mf_call inv_mix_col - mov [esi-12],eax - mov edx,[esi-8] - mf_call inv_mix_col - mov [esi-8],eax - mov edx,[esi-4] - mf_call inv_mix_col - mov [esi-4],eax - -.0: mov edx,[esi] ; we can now speed up the remaining - mf_call inv_mix_col ; rounds by using the technique - mov [esi],eax ; outlined earlier. But note that - xor eax,[esi-28] ; there is one extra inverse mix - mov [esi+4],eax ; column operation as the 256 bit - xor eax,[esi-24] ; key has an extra non-linear step - mov [esi+8],eax ; for the midway element. - xor eax,[esi-20] - mov [esi+12],eax ; the expanded key is 15 * 4 = 60 - mov edx,[esi+16] ; 32-bit words of which 52 need to - mf_call inv_mix_col ; be modified. We have already done - mov [esi+16],eax ; 12 so 40 are left - which means - xor eax,[esi-12] ; that we need exactly 5 loops of 8 - mov [esi+20],eax - xor eax,[esi-8] - mov [esi+24],eax - xor eax,[esi-4] - mov [esi+28],eax - add esi,32 - cmp edi,esi - jg .0 - -%endif - -dec_end: - -%ifdef AES_REV_DKS - - movzx esi,word [esp+20] ; this reverses the order of the -.1: mov eax,[esi] ; round keys if required - mov ebx,[esi+4] - mov ebp,[edi] - mov edx,[edi+4] - mov [esi],ebp - mov [esi+4],edx - mov [edi],eax - mov [edi+4],ebx - - mov eax,[esi+8] - mov ebx,[esi+12] - mov ebp,[edi+8] - mov edx,[edi+12] - mov [esi+8],ebp - mov [esi+12],edx - mov [edi+8],eax - mov [edi+12],ebx - - add esi,16 - sub edi,16 - cmp edi,esi - jg .1 - -%endif - - pop edi - pop esi - pop ebx - pop ebp - xor eax,eax - do_exit 8 - -%ifdef AES_VAR - - do_name _aes_decrypt_key,12 - - mov ecx,[esp+4] - mov eax,[esp+8] - mov edx,[esp+12] - push edx - push ecx - - cmp eax,16 - je .1 - cmp eax,128 - je .1 - - cmp eax,24 - je .2 - cmp eax,192 - je .2 - - cmp eax,32 - je .3 - cmp eax,256 - je .3 - mov eax,-1 - add esp,8 - do_exit 12 - -.1: do_call _aes_decrypt_key128,8 - do_exit 12 -.2: do_call _aes_decrypt_key192,8 - do_exit 12 -.3: do_call _aes_decrypt_key256,8 - do_exit 12 - -%endif - -%endif - -%ifdef DECRYPTION_TABLE - -; Inverse S-box data - 256 entries - - section _DATA - -%define v8(x) fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x - -_aes_dec_tab: - db v8(0x52),v8(0x09),v8(0x6a),v8(0xd5),v8(0x30),v8(0x36),v8(0xa5),v8(0x38) - db v8(0xbf),v8(0x40),v8(0xa3),v8(0x9e),v8(0x81),v8(0xf3),v8(0xd7),v8(0xfb) - db v8(0x7c),v8(0xe3),v8(0x39),v8(0x82),v8(0x9b),v8(0x2f),v8(0xff),v8(0x87) - db v8(0x34),v8(0x8e),v8(0x43),v8(0x44),v8(0xc4),v8(0xde),v8(0xe9),v8(0xcb) - db v8(0x54),v8(0x7b),v8(0x94),v8(0x32),v8(0xa6),v8(0xc2),v8(0x23),v8(0x3d) - db v8(0xee),v8(0x4c),v8(0x95),v8(0x0b),v8(0x42),v8(0xfa),v8(0xc3),v8(0x4e) - db v8(0x08),v8(0x2e),v8(0xa1),v8(0x66),v8(0x28),v8(0xd9),v8(0x24),v8(0xb2) - db v8(0x76),v8(0x5b),v8(0xa2),v8(0x49),v8(0x6d),v8(0x8b),v8(0xd1),v8(0x25) - db v8(0x72),v8(0xf8),v8(0xf6),v8(0x64),v8(0x86),v8(0x68),v8(0x98),v8(0x16) - db v8(0xd4),v8(0xa4),v8(0x5c),v8(0xcc),v8(0x5d),v8(0x65),v8(0xb6),v8(0x92) - db v8(0x6c),v8(0x70),v8(0x48),v8(0x50),v8(0xfd),v8(0xed),v8(0xb9),v8(0xda) - db v8(0x5e),v8(0x15),v8(0x46),v8(0x57),v8(0xa7),v8(0x8d),v8(0x9d),v8(0x84) - db v8(0x90),v8(0xd8),v8(0xab),v8(0x00),v8(0x8c),v8(0xbc),v8(0xd3),v8(0x0a) - db v8(0xf7),v8(0xe4),v8(0x58),v8(0x05),v8(0xb8),v8(0xb3),v8(0x45),v8(0x06) - db v8(0xd0),v8(0x2c),v8(0x1e),v8(0x8f),v8(0xca),v8(0x3f),v8(0x0f),v8(0x02) - db v8(0xc1),v8(0xaf),v8(0xbd),v8(0x03),v8(0x01),v8(0x13),v8(0x8a),v8(0x6b) - db v8(0x3a),v8(0x91),v8(0x11),v8(0x41),v8(0x4f),v8(0x67),v8(0xdc),v8(0xea) - db v8(0x97),v8(0xf2),v8(0xcf),v8(0xce),v8(0xf0),v8(0xb4),v8(0xe6),v8(0x73) - db v8(0x96),v8(0xac),v8(0x74),v8(0x22),v8(0xe7),v8(0xad),v8(0x35),v8(0x85) - db v8(0xe2),v8(0xf9),v8(0x37),v8(0xe8),v8(0x1c),v8(0x75),v8(0xdf),v8(0x6e) - db v8(0x47),v8(0xf1),v8(0x1a),v8(0x71),v8(0x1d),v8(0x29),v8(0xc5),v8(0x89) - db v8(0x6f),v8(0xb7),v8(0x62),v8(0x0e),v8(0xaa),v8(0x18),v8(0xbe),v8(0x1b) - db v8(0xfc),v8(0x56),v8(0x3e),v8(0x4b),v8(0xc6),v8(0xd2),v8(0x79),v8(0x20) - db v8(0x9a),v8(0xdb),v8(0xc0),v8(0xfe),v8(0x78),v8(0xcd),v8(0x5a),v8(0xf4) - db v8(0x1f),v8(0xdd),v8(0xa8),v8(0x33),v8(0x88),v8(0x07),v8(0xc7),v8(0x31) - db v8(0xb1),v8(0x12),v8(0x10),v8(0x59),v8(0x27),v8(0x80),v8(0xec),v8(0x5f) - db v8(0x60),v8(0x51),v8(0x7f),v8(0xa9),v8(0x19),v8(0xb5),v8(0x4a),v8(0x0d) - db v8(0x2d),v8(0xe5),v8(0x7a),v8(0x9f),v8(0x93),v8(0xc9),v8(0x9c),v8(0xef) - db v8(0xa0),v8(0xe0),v8(0x3b),v8(0x4d),v8(0xae),v8(0x2a),v8(0xf5),v8(0xb0) - db v8(0xc8),v8(0xeb),v8(0xbb),v8(0x3c),v8(0x83),v8(0x53),v8(0x99),v8(0x61) - db v8(0x17),v8(0x2b),v8(0x04),v8(0x7e),v8(0xba),v8(0x77),v8(0xd6),v8(0x26) - db v8(0xe1),v8(0x69),v8(0x14),v8(0x63),v8(0x55),v8(0x21),v8(0x0c),v8(0x7d) - -%endif + +; --------------------------------------------------------------------------- +; Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. +; +; LICENSE TERMS +; +; The free distribution and use of this software is allowed (with or without +; changes) provided that: +; +; 1. source code distributions include the above copyright notice, this +; list of conditions and the following disclaimer; +; +; 2. binary distributions include the above copyright notice, this list +; of conditions and the following disclaimer in their documentation; +; +; 3. the name of the copyright holder is not used to endorse products +; built using this software without specific written permission. +; +; DISCLAIMER +; +; This software is provided 'as is' with no explicit or implied warranties +; in respect of its properties, including, but not limited to, correctness +; and/or fitness for purpose. +; --------------------------------------------------------------------------- +; Issue 20/12/2007 +; +; This code requires either ASM_X86_V2 or ASM_X86_V2C to be set in aesopt.h +; and the same define to be set here as well. If AES_V2C is set this file +; requires the C files aeskey.c and aestab.c for support. + +; An AES implementation for x86 processors using the YASM (or NASM) assembler. +; This is a full assembler implementation covering encryption, decryption and +; key scheduling. It uses 2k bytes of tables but its encryption and decryption +; performance is very close to that obtained using large tables. Key schedule +; expansion is slower for both encryption and decryption but this is likely to +; be offset by the much smaller load that this version places on the processor +; cache. I acknowledge the contribution made by Daniel Bernstein to aspects of +; the design of the AES round function used here. +; +; This code provides the standard AES block size (128 bits, 16 bytes) and the +; three standard AES key sizes (128, 192 and 256 bits). It has the same call +; interface as my C implementation. The ebx, esi, edi and ebp registers are +; preserved across calls but eax, ecx and edx and the artihmetic status flags +; are not. Although this is a full assembler implementation, it can be used +; in conjunction with my C code which provides faster key scheduling using +; large tables. In this case aeskey.c should be compiled with ASM_X86_V2C +; defined. It is also important that the defines below match those used in the +; C code. This code uses the VC++ register saving conentions; if it is used +; with another compiler, conventions for using and saving registers may need +; to be checked (and calling conventions). The YASM command line for the VC++ +; custom build step is: +; +; yasm -Xvc -f win32 -D -o "$(TargetDir)\$(InputName).obj" "$(InputPath)" +; +; For the cryptlib build this is (pcg): +; +; yasm -Xvc -f win32 -D ASM_X86_V2C -o aescrypt2.obj aes_x86_v2.asm +; +; where is ASM_X86_V2 or ASM_X86_V2C. The calling intefaces are: +; +; AES_RETURN aes_encrypt(const unsigned char in_blk[], +; unsigned char out_blk[], const aes_encrypt_ctx cx[1]); +; +; AES_RETURN aes_decrypt(const unsigned char in_blk[], +; unsigned char out_blk[], const aes_decrypt_ctx cx[1]); +; +; AES_RETURN aes_encrypt_key(const unsigned char key[], +; const aes_encrypt_ctx cx[1]); +; +; AES_RETURN aes_decrypt_key(const unsigned char key[], +; const aes_decrypt_ctx cx[1]); +; +; AES_RETURN aes_encrypt_key(const unsigned char key[], +; unsigned int len, const aes_decrypt_ctx cx[1]); +; +; AES_RETURN aes_decrypt_key(const unsigned char key[], +; unsigned int len, const aes_decrypt_ctx cx[1]); +; +; where is 128, 102 or 256. In the last two calls the length can be in +; either bits or bytes. + +; The DLL interface must use the _stdcall convention in which the number +; of bytes of parameter space is added after an @ to the sutine's name. +; We must also remove our parameters from the stack before return (see +; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version. + +; +; Adapted for TrueCrypt: +; - All tables generated at run-time +; - Adapted for 16-bit environment +; + +CPU 386 +USE16 +SEGMENT _TEXT PUBLIC CLASS=CODE USE16 +SEGMENT _DATA PUBLIC CLASS=DATA USE16 + +GROUP DGROUP _TEXT _DATA + +extern _aes_dec_tab ; Aestab.c +extern _aes_enc_tab + +; %define DLL_EXPORT + +; The size of the code can be reduced by using functions for the encryption +; and decryption rounds in place of macro expansion + +%define REDUCE_CODE_SIZE + +; Comment in/out the following lines to obtain the desired subroutines. These +; selections MUST match those in the C header file aes.h + +; %define AES_128 ; define if AES with 128 bit keys is needed +; %define AES_192 ; define if AES with 192 bit keys is needed +%define AES_256 ; define if AES with 256 bit keys is needed +; %define AES_VAR ; define if a variable key size is needed +%define ENCRYPTION ; define if encryption is needed +%define DECRYPTION ; define if decryption is needed +; %define AES_REV_DKS ; define if key decryption schedule is reversed + +%ifndef ASM_X86_V2C +%define ENCRYPTION_KEY_SCHEDULE ; define if encryption key expansion is needed +%define DECRYPTION_KEY_SCHEDULE ; define if decryption key expansion is needed +%endif + +; The encryption key schedule has the following in memory layout where N is the +; number of rounds (10, 12 or 14): +; +; lo: | input key (round 0) | ; each round is four 32-bit words +; | encryption round 1 | +; | encryption round 2 | +; .... +; | encryption round N-1 | +; hi: | encryption round N | +; +; The decryption key schedule is normally set up so that it has the same +; layout as above by actually reversing the order of the encryption key +; schedule in memory (this happens when AES_REV_DKS is set): +; +; lo: | decryption round 0 | = | encryption round N | +; | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ] +; | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ] +; .... .... +; | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ] +; hi: | decryption round N | = | input key (round 0) | +; +; with rounds except the first and last modified using inv_mix_column() +; But if AES_REV_DKS is NOT set the order of keys is left as it is for +; encryption so that it has to be accessed in reverse when used for +; decryption (although the inverse mix column modifications are done) +; +; lo: | decryption round 0 | = | input key (round 0) | +; | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ] +; | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ] +; .... .... +; | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ] +; hi: | decryption round N | = | encryption round N | +; +; This layout is faster when the assembler key scheduling provided here +; is used. +; +; End of user defines + +%ifdef AES_VAR +%ifndef AES_128 +%define AES_128 +%endif +%ifndef AES_192 +%define AES_192 +%endif +%ifndef AES_256 +%define AES_256 +%endif +%endif + +%ifdef AES_VAR +%define KS_LENGTH 60 +%elifdef AES_256 +%define KS_LENGTH 60 +%elifdef AES_192 +%define KS_LENGTH 52 +%else +%define KS_LENGTH 44 +%endif + +; These macros implement stack based local variables + +%macro save 2 + mov [esp+4*%1],%2 +%endmacro + +%macro restore 2 + mov %1,[esp+4*%2] +%endmacro + +%ifdef REDUCE_CODE_SIZE + %macro mf_call 1 + call %1 + %endmacro +%else + %macro mf_call 1 + %1 + %endmacro +%endif + +; the DLL has to implement the _stdcall calling interface on return +; In this case we have to take our parameters (3 4-byte pointers) +; off the stack + +%define parms 12 + +%macro do_name 1-2 parms +%ifndef DLL_EXPORT + global %1 +%1: +%else + global %1@%2 + export %1@%2 +%1@%2: +%endif +%endmacro + +%macro do_call 1-2 parms +%ifndef DLL_EXPORT + call %1 + add esp,%2 +%else + call %1@%2 +%endif +%endmacro + +%macro do_exit 0-1 parms +%ifdef DLL_EXPORT + ret %1 +%else + ret +%endif +%endmacro + +; finite field multiplies by {02}, {04} and {08} + +%define f2(x) ((x<<1)^(((x>>7)&1)*0x11b)) +%define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) +%define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) + +; finite field multiplies required in table generation + +%define f3(x) (f2(x) ^ x) +%define f9(x) (f8(x) ^ x) +%define fb(x) (f8(x) ^ f2(x) ^ x) +%define fd(x) (f8(x) ^ f4(x) ^ x) +%define fe(x) (f8(x) ^ f4(x) ^ f2(x)) + +%define etab_0(x) [_aes_enc_tab+4+8*x] +%define etab_1(x) [_aes_enc_tab+3+8*x] +%define etab_2(x) [_aes_enc_tab+2+8*x] +%define etab_3(x) [_aes_enc_tab+1+8*x] +%define etab_b(x) byte [_aes_enc_tab+1+8*x] ; used with movzx for 0x000000xx +%define etab_w(x) word [_aes_enc_tab+8*x] ; used with movzx for 0x0000xx00 + +%define btab_0(x) [_aes_enc_tab+6+8*x] +%define btab_1(x) [_aes_enc_tab+5+8*x] +%define btab_2(x) [_aes_enc_tab+4+8*x] +%define btab_3(x) [_aes_enc_tab+3+8*x] + +; ROUND FUNCTION. Build column[2] on ESI and column[3] on EDI that have the +; round keys pre-loaded. Build column[0] in EBP and column[1] in EBX. +; +; Input: +; +; EAX column[0] +; EBX column[1] +; ECX column[2] +; EDX column[3] +; ESI column key[round][2] +; EDI column key[round][3] +; EBP scratch +; +; Output: +; +; EBP column[0] unkeyed +; EBX column[1] unkeyed +; ESI column[2] keyed +; EDI column[3] keyed +; EAX scratch +; ECX scratch +; EDX scratch + +%macro rnd_fun 2 + + rol ebx,16 + %1 esi, cl, 0, ebp + %1 esi, dh, 1, ebp + %1 esi, bh, 3, ebp + %1 edi, dl, 0, ebp + %1 edi, ah, 1, ebp + %1 edi, bl, 2, ebp + %2 ebp, al, 0, ebp + shr ebx,16 + and eax,0xffff0000 + or eax,ebx + shr edx,16 + %1 ebp, ah, 1, ebx + %1 ebp, dh, 3, ebx + %2 ebx, dl, 2, ebx + %1 ebx, ch, 1, edx + %1 ebx, al, 0, edx + shr eax,16 + shr ecx,16 + %1 ebp, cl, 2, edx + %1 edi, ch, 3, edx + %1 esi, al, 2, edx + %1 ebx, ah, 3, edx + +%endmacro + +; Basic MOV and XOR Operations for normal rounds + +%macro nr_xor 4 + movzx %4,%2 + xor %1,etab_%3(%4) +%endmacro + +%macro nr_mov 4 + movzx %4,%2 + mov %1,etab_%3(%4) +%endmacro + +; Basic MOV and XOR Operations for last round + +%if 1 + + %macro lr_xor 4 + movzx %4,%2 + movzx %4,etab_b(%4) + %if %3 != 0 + shl %4,8*%3 + %endif + xor %1,%4 + %endmacro + + %macro lr_mov 4 + movzx %4,%2 + movzx %1,etab_b(%4) + %if %3 != 0 + shl %1,8*%3 + %endif + %endmacro + +%else ; less effective but worth leaving as an option + + %macro lr_xor 4 + movzx %4,%2 + mov %4,btab_%3(%4) + and %4,0x000000ff << 8 * %3 + xor %1,%4 + %endmacro + + %macro lr_mov 4 + movzx %4,%2 + mov %1,btab_%3(%4) + and %1,0x000000ff << 8 * %3 + %endmacro + +%endif + +; Apply S-Box to the 4 bytes in a 32-bit word and rotate byte positions + +%ifdef REDUCE_CODE_SIZE + +l3s_col: + movzx ecx,al ; in eax + movzx ecx, etab_b(ecx) ; out eax + xor edx,ecx ; scratch ecx,edx + movzx ecx,ah + movzx ecx, etab_b(ecx) + shl ecx,8 + xor edx,ecx + shr eax,16 + movzx ecx,al + movzx ecx, etab_b(ecx) + shl ecx,16 + xor edx,ecx + movzx ecx,ah + movzx ecx, etab_b(ecx) + shl ecx,24 + xor edx,ecx + mov eax,edx + ret + +%else + +%macro l3s_col 0 + + movzx ecx,al ; in eax + movzx ecx, etab_b(ecx) ; out eax + xor edx,ecx ; scratch ecx,edx + movzx ecx,ah + movzx ecx, etab_b(ecx) + shl ecx,8 + xor edx,ecx + shr eax,16 + movzx ecx,al + movzx ecx, etab_b(ecx) + shl ecx,16 + xor edx,ecx + movzx ecx,ah + movzx ecx, etab_b(ecx) + shl ecx,24 + xor edx,ecx + mov eax,edx + +%endmacro + +%endif + +; offsets to parameters + +in_blk equ 2 ; input byte array address parameter +out_blk equ 4 ; output byte array address parameter +ctx equ 6 ; AES context structure +stk_spc equ 20 ; stack space + +%ifdef ENCRYPTION + +; %define ENCRYPTION_TABLE + +%ifdef REDUCE_CODE_SIZE + +enc_round: + sub sp, 2 + add ebp,16 + save 1,ebp + mov esi,[ebp+8] + mov edi,[ebp+12] + + rnd_fun nr_xor, nr_mov + + mov eax,ebp + mov ecx,esi + mov edx,edi + restore ebp,1 + xor eax,[ebp] + xor ebx,[ebp+4] + add sp, 2 + ret + +%else + +%macro enc_round 0 + + add ebp,16 + save 0,ebp + mov esi,[ebp+8] + mov edi,[ebp+12] + + rnd_fun nr_xor, nr_mov + + mov eax,ebp + mov ecx,esi + mov edx,edi + restore ebp,0 + xor eax,[ebp] + xor ebx,[ebp+4] + +%endmacro + +%endif + +%macro enc_last_round 0 + + add ebp,16 + save 0,ebp + mov esi,[ebp+8] + mov edi,[ebp+12] + + rnd_fun lr_xor, lr_mov + + mov eax,ebp + restore ebp,0 + xor eax,[ebp] + xor ebx,[ebp+4] + +%endmacro + + section _TEXT + +; AES Encryption Subroutine + + do_name _aes_encrypt,12 + + mov ax, sp + movzx esp, ax + + sub esp,stk_spc + mov [esp+16],ebp + mov [esp+12],ebx + mov [esp+ 8],esi + mov [esp+ 4],edi + + movzx esi,word [esp+in_blk+stk_spc] ; input pointer + mov eax,[esi ] + mov ebx,[esi+ 4] + mov ecx,[esi+ 8] + mov edx,[esi+12] + + movzx ebp,word [esp+ctx+stk_spc] ; key pointer + movzx edi,byte [ebp+4*KS_LENGTH] + xor eax,[ebp ] + xor ebx,[ebp+ 4] + xor ecx,[ebp+ 8] + xor edx,[ebp+12] + +; determine the number of rounds + +%ifndef AES_256 + cmp edi,10*16 + je .3 + cmp edi,12*16 + je .2 + cmp edi,14*16 + je .1 + mov eax,-1 + jmp .5 +%endif + +.1: mf_call enc_round + mf_call enc_round +.2: mf_call enc_round + mf_call enc_round +.3: mf_call enc_round + mf_call enc_round + mf_call enc_round + mf_call enc_round + mf_call enc_round + mf_call enc_round + mf_call enc_round + mf_call enc_round + mf_call enc_round + enc_last_round + + movzx edx,word [esp+out_blk+stk_spc] + mov [edx],eax + mov [edx+4],ebx + mov [edx+8],esi + mov [edx+12],edi + xor eax,eax + +.5: mov ebp,[esp+16] + mov ebx,[esp+12] + mov esi,[esp+ 8] + mov edi,[esp+ 4] + add esp,stk_spc + do_exit 12 + +%endif + +%macro f_key 2 + + push ecx + push edx + mov edx,esi + ror eax,8 + mf_call l3s_col + mov esi,eax + pop edx + pop ecx + xor esi,rc_val + + mov [ebp+%1*%2],esi + xor edi,esi + mov [ebp+%1*%2+4],edi + xor ecx,edi + mov [ebp+%1*%2+8],ecx + xor edx,ecx + mov [ebp+%1*%2+12],edx + mov eax,edx + +%if %2 == 24 + +%if %1 < 7 + xor eax,[ebp+%1*%2+16-%2] + mov [ebp+%1*%2+16],eax + xor eax,[ebp+%1*%2+20-%2] + mov [ebp+%1*%2+20],eax +%endif + +%elif %2 == 32 + +%if %1 < 6 + push ecx + push edx + mov edx,[ebp+%1*%2+16-%2] + mf_call l3s_col + pop edx + pop ecx + mov [ebp+%1*%2+16],eax + xor eax,[ebp+%1*%2+20-%2] + mov [ebp+%1*%2+20],eax + xor eax,[ebp+%1*%2+24-%2] + mov [ebp+%1*%2+24],eax + xor eax,[ebp+%1*%2+28-%2] + mov [ebp+%1*%2+28],eax +%endif + +%endif + +%assign rc_val f2(rc_val) + +%endmacro + +%ifdef ENCRYPTION_KEY_SCHEDULE + +%ifdef AES_128 + +%ifndef ENCRYPTION_TABLE +; %define ENCRYPTION_TABLE +%endif + +%assign rc_val 1 + + do_name _aes_encrypt_key128,8 + + push ebp + push ebx + push esi + push edi + + mov ebp,[esp+24] + mov [ebp+4*KS_LENGTH],dword 10*16 + mov ebx,[esp+20] + + mov esi,[ebx] + mov [ebp],esi + mov edi,[ebx+4] + mov [ebp+4],edi + mov ecx,[ebx+8] + mov [ebp+8],ecx + mov edx,[ebx+12] + mov [ebp+12],edx + add ebp,16 + mov eax,edx + + f_key 0,16 ; 11 * 4 = 44 unsigned longs + f_key 1,16 ; 4 + 4 * 10 generated = 44 + f_key 2,16 + f_key 3,16 + f_key 4,16 + f_key 5,16 + f_key 6,16 + f_key 7,16 + f_key 8,16 + f_key 9,16 + + pop edi + pop esi + pop ebx + pop ebp + xor eax,eax + do_exit 8 + +%endif + +%ifdef AES_192 + +%ifndef ENCRYPTION_TABLE +; %define ENCRYPTION_TABLE +%endif + +%assign rc_val 1 + + do_name _aes_encrypt_key192,8 + + push ebp + push ebx + push esi + push edi + + mov ebp,[esp+24] + mov [ebp+4*KS_LENGTH],dword 12 * 16 + mov ebx,[esp+20] + + mov esi,[ebx] + mov [ebp],esi + mov edi,[ebx+4] + mov [ebp+4],edi + mov ecx,[ebx+8] + mov [ebp+8],ecx + mov edx,[ebx+12] + mov [ebp+12],edx + mov eax,[ebx+16] + mov [ebp+16],eax + mov eax,[ebx+20] + mov [ebp+20],eax + add ebp,24 + + f_key 0,24 ; 13 * 4 = 52 unsigned longs + f_key 1,24 ; 6 + 6 * 8 generated = 54 + f_key 2,24 + f_key 3,24 + f_key 4,24 + f_key 5,24 + f_key 6,24 + f_key 7,24 + + pop edi + pop esi + pop ebx + pop ebp + xor eax,eax + do_exit 8 + +%endif + +%ifdef AES_256 + +%ifndef ENCRYPTION_TABLE +; %define ENCRYPTION_TABLE +%endif + +%assign rc_val 1 + + do_name _aes_encrypt_key256,8 + + mov ax, sp + movzx esp, ax + + push ebp + push ebx + push esi + push edi + + movzx ebp, word [esp+20] ; ks + mov [ebp+4*KS_LENGTH],dword 14 * 16 + movzx ebx, word [esp+18] ; key + + mov esi,[ebx] + mov [ebp],esi + mov edi,[ebx+4] + mov [ebp+4],edi + mov ecx,[ebx+8] + mov [ebp+8],ecx + mov edx,[ebx+12] + mov [ebp+12],edx + mov eax,[ebx+16] + mov [ebp+16],eax + mov eax,[ebx+20] + mov [ebp+20],eax + mov eax,[ebx+24] + mov [ebp+24],eax + mov eax,[ebx+28] + mov [ebp+28],eax + add ebp,32 + + f_key 0,32 ; 15 * 4 = 60 unsigned longs + f_key 1,32 ; 8 + 8 * 7 generated = 64 + f_key 2,32 + f_key 3,32 + f_key 4,32 + f_key 5,32 + f_key 6,32 + + pop edi + pop esi + pop ebx + pop ebp + xor eax,eax + do_exit 8 + +%endif + +%ifdef AES_VAR + +%ifndef ENCRYPTION_TABLE +; %define ENCRYPTION_TABLE +%endif + + do_name _aes_encrypt_key,12 + + mov ecx,[esp+4] + mov eax,[esp+8] + mov edx,[esp+12] + push edx + push ecx + + cmp eax,16 + je .1 + cmp eax,128 + je .1 + + cmp eax,24 + je .2 + cmp eax,192 + je .2 + + cmp eax,32 + je .3 + cmp eax,256 + je .3 + mov eax,-1 + add esp,8 + do_exit 12 + +.1: do_call _aes_encrypt_key128,8 + do_exit 12 +.2: do_call _aes_encrypt_key192,8 + do_exit 12 +.3: do_call _aes_encrypt_key256,8 + do_exit 12 + +%endif + +%endif + +%ifdef ENCRYPTION_TABLE + +; S-box data - 256 entries + + section _DATA + +%define u8(x) 0, x, x, f3(x), f2(x), x, x, f3(x) + +_aes_enc_tab: + db u8(0x63),u8(0x7c),u8(0x77),u8(0x7b),u8(0xf2),u8(0x6b),u8(0x6f),u8(0xc5) + db u8(0x30),u8(0x01),u8(0x67),u8(0x2b),u8(0xfe),u8(0xd7),u8(0xab),u8(0x76) + db u8(0xca),u8(0x82),u8(0xc9),u8(0x7d),u8(0xfa),u8(0x59),u8(0x47),u8(0xf0) + db u8(0xad),u8(0xd4),u8(0xa2),u8(0xaf),u8(0x9c),u8(0xa4),u8(0x72),u8(0xc0) + db u8(0xb7),u8(0xfd),u8(0x93),u8(0x26),u8(0x36),u8(0x3f),u8(0xf7),u8(0xcc) + db u8(0x34),u8(0xa5),u8(0xe5),u8(0xf1),u8(0x71),u8(0xd8),u8(0x31),u8(0x15) + db u8(0x04),u8(0xc7),u8(0x23),u8(0xc3),u8(0x18),u8(0x96),u8(0x05),u8(0x9a) + db u8(0x07),u8(0x12),u8(0x80),u8(0xe2),u8(0xeb),u8(0x27),u8(0xb2),u8(0x75) + db u8(0x09),u8(0x83),u8(0x2c),u8(0x1a),u8(0x1b),u8(0x6e),u8(0x5a),u8(0xa0) + db u8(0x52),u8(0x3b),u8(0xd6),u8(0xb3),u8(0x29),u8(0xe3),u8(0x2f),u8(0x84) + db u8(0x53),u8(0xd1),u8(0x00),u8(0xed),u8(0x20),u8(0xfc),u8(0xb1),u8(0x5b) + db u8(0x6a),u8(0xcb),u8(0xbe),u8(0x39),u8(0x4a),u8(0x4c),u8(0x58),u8(0xcf) + db u8(0xd0),u8(0xef),u8(0xaa),u8(0xfb),u8(0x43),u8(0x4d),u8(0x33),u8(0x85) + db u8(0x45),u8(0xf9),u8(0x02),u8(0x7f),u8(0x50),u8(0x3c),u8(0x9f),u8(0xa8) + db u8(0x51),u8(0xa3),u8(0x40),u8(0x8f),u8(0x92),u8(0x9d),u8(0x38),u8(0xf5) + db u8(0xbc),u8(0xb6),u8(0xda),u8(0x21),u8(0x10),u8(0xff),u8(0xf3),u8(0xd2) + db u8(0xcd),u8(0x0c),u8(0x13),u8(0xec),u8(0x5f),u8(0x97),u8(0x44),u8(0x17) + db u8(0xc4),u8(0xa7),u8(0x7e),u8(0x3d),u8(0x64),u8(0x5d),u8(0x19),u8(0x73) + db u8(0x60),u8(0x81),u8(0x4f),u8(0xdc),u8(0x22),u8(0x2a),u8(0x90),u8(0x88) + db u8(0x46),u8(0xee),u8(0xb8),u8(0x14),u8(0xde),u8(0x5e),u8(0x0b),u8(0xdb) + db u8(0xe0),u8(0x32),u8(0x3a),u8(0x0a),u8(0x49),u8(0x06),u8(0x24),u8(0x5c) + db u8(0xc2),u8(0xd3),u8(0xac),u8(0x62),u8(0x91),u8(0x95),u8(0xe4),u8(0x79) + db u8(0xe7),u8(0xc8),u8(0x37),u8(0x6d),u8(0x8d),u8(0xd5),u8(0x4e),u8(0xa9) + db u8(0x6c),u8(0x56),u8(0xf4),u8(0xea),u8(0x65),u8(0x7a),u8(0xae),u8(0x08) + db u8(0xba),u8(0x78),u8(0x25),u8(0x2e),u8(0x1c),u8(0xa6),u8(0xb4),u8(0xc6) + db u8(0xe8),u8(0xdd),u8(0x74),u8(0x1f),u8(0x4b),u8(0xbd),u8(0x8b),u8(0x8a) + db u8(0x70),u8(0x3e),u8(0xb5),u8(0x66),u8(0x48),u8(0x03),u8(0xf6),u8(0x0e) + db u8(0x61),u8(0x35),u8(0x57),u8(0xb9),u8(0x86),u8(0xc1),u8(0x1d),u8(0x9e) + db u8(0xe1),u8(0xf8),u8(0x98),u8(0x11),u8(0x69),u8(0xd9),u8(0x8e),u8(0x94) + db u8(0x9b),u8(0x1e),u8(0x87),u8(0xe9),u8(0xce),u8(0x55),u8(0x28),u8(0xdf) + db u8(0x8c),u8(0xa1),u8(0x89),u8(0x0d),u8(0xbf),u8(0xe6),u8(0x42),u8(0x68) + db u8(0x41),u8(0x99),u8(0x2d),u8(0x0f),u8(0xb0),u8(0x54),u8(0xbb),u8(0x16) + +%endif + +%ifdef DECRYPTION + +; %define DECRYPTION_TABLE + +%define dtab_0(x) [_aes_dec_tab+ 8*x] +%define dtab_1(x) [_aes_dec_tab+3+8*x] +%define dtab_2(x) [_aes_dec_tab+2+8*x] +%define dtab_3(x) [_aes_dec_tab+1+8*x] +%define dtab_x(x) byte [_aes_dec_tab+7+8*x] + +%macro irn_fun 2 + + rol eax,16 + %1 esi, cl, 0, ebp + %1 esi, bh, 1, ebp + %1 esi, al, 2, ebp + %1 edi, dl, 0, ebp + %1 edi, ch, 1, ebp + %1 edi, ah, 3, ebp + %2 ebp, bl, 0, ebp + shr eax,16 + and ebx,0xffff0000 + or ebx,eax + shr ecx,16 + %1 ebp, bh, 1, eax + %1 ebp, ch, 3, eax + %2 eax, cl, 2, ecx + %1 eax, bl, 0, ecx + %1 eax, dh, 1, ecx + shr ebx,16 + shr edx,16 + %1 esi, dh, 3, ecx + %1 ebp, dl, 2, ecx + %1 eax, bh, 3, ecx + %1 edi, bl, 2, ecx + +%endmacro + +; Basic MOV and XOR Operations for normal rounds + +%macro ni_xor 4 + movzx %4,%2 + xor %1,dtab_%3(%4) +%endmacro + +%macro ni_mov 4 + movzx %4,%2 + mov %1,dtab_%3(%4) +%endmacro + +; Basic MOV and XOR Operations for last round + +%macro li_xor 4 + movzx %4,%2 + movzx %4,dtab_x(%4) +%if %3 != 0 + shl %4,8*%3 +%endif + xor %1,%4 +%endmacro + +%macro li_mov 4 + movzx %4,%2 + movzx %1,dtab_x(%4) +%if %3 != 0 + shl %1,8*%3 +%endif +%endmacro + +%ifdef REDUCE_CODE_SIZE + +dec_round: + sub sp, 2 +%ifdef AES_REV_DKS + add ebp,16 +%else + sub ebp,16 +%endif + save 1,ebp + mov esi,[ebp+8] + mov edi,[ebp+12] + + irn_fun ni_xor, ni_mov + + mov ebx,ebp + mov ecx,esi + mov edx,edi + restore ebp,1 + xor eax,[ebp] + xor ebx,[ebp+4] + add sp, 2 + ret + +%else + +%macro dec_round 0 + +%ifdef AES_REV_DKS + add ebp,16 +%else + sub ebp,16 +%endif + save 0,ebp + mov esi,[ebp+8] + mov edi,[ebp+12] + + irn_fun ni_xor, ni_mov + + mov ebx,ebp + mov ecx,esi + mov edx,edi + restore ebp,0 + xor eax,[ebp] + xor ebx,[ebp+4] + +%endmacro + +%endif + +%macro dec_last_round 0 + +%ifdef AES_REV_DKS + add ebp,16 +%else + sub ebp,16 +%endif + save 0,ebp + mov esi,[ebp+8] + mov edi,[ebp+12] + + irn_fun li_xor, li_mov + + mov ebx,ebp + restore ebp,0 + xor eax,[ebp] + xor ebx,[ebp+4] + +%endmacro + + section _TEXT + +; AES Decryption Subroutine + + do_name _aes_decrypt,12 + + mov ax, sp + movzx esp, ax + + sub esp,stk_spc + mov [esp+16],ebp + mov [esp+12],ebx + mov [esp+ 8],esi + mov [esp+ 4],edi + +; input four columns and xor in first round key + + movzx esi,word [esp+in_blk+stk_spc] ; input pointer + mov eax,[esi ] + mov ebx,[esi+ 4] + mov ecx,[esi+ 8] + mov edx,[esi+12] + lea esi,[esi+16] + + movzx ebp, word [esp+ctx+stk_spc] ; key pointer + movzx edi,byte[ebp+4*KS_LENGTH] +%ifndef AES_REV_DKS ; if decryption key schedule is not reversed + lea ebp,[ebp+edi] ; we have to access it from the top down +%endif + xor eax,[ebp ] ; key schedule + xor ebx,[ebp+ 4] + xor ecx,[ebp+ 8] + xor edx,[ebp+12] + +; determine the number of rounds + +%ifndef AES_256 + cmp edi,10*16 + je .3 + cmp edi,12*16 + je .2 + cmp edi,14*16 + je .1 + mov eax,-1 + jmp .5 +%endif + +.1: mf_call dec_round + mf_call dec_round +.2: mf_call dec_round + mf_call dec_round +.3: mf_call dec_round + mf_call dec_round + mf_call dec_round + mf_call dec_round + mf_call dec_round + mf_call dec_round + mf_call dec_round + mf_call dec_round + mf_call dec_round + dec_last_round + +; move final values to the output array. + + movzx ebp,word [esp+out_blk+stk_spc] + mov [ebp],eax + mov [ebp+4],ebx + mov [ebp+8],esi + mov [ebp+12],edi + xor eax,eax + +.5: mov ebp,[esp+16] + mov ebx,[esp+12] + mov esi,[esp+ 8] + mov edi,[esp+ 4] + add esp,stk_spc + do_exit 12 + +%endif + +%ifdef REDUCE_CODE_SIZE + +inv_mix_col: + movzx ecx,dl ; input eax, edx + movzx ecx,etab_b(ecx) ; output eax + mov eax,dtab_0(ecx) ; used ecx + movzx ecx,dh + shr edx,16 + movzx ecx,etab_b(ecx) + xor eax,dtab_1(ecx) + movzx ecx,dl + movzx ecx,etab_b(ecx) + xor eax,dtab_2(ecx) + movzx ecx,dh + movzx ecx,etab_b(ecx) + xor eax,dtab_3(ecx) + ret + +%else + +%macro inv_mix_col 0 + + movzx ecx,dl ; input eax, edx + movzx ecx,etab_b(ecx) ; output eax + mov eax,dtab_0(ecx) ; used ecx + movzx ecx,dh + shr edx,16 + movzx ecx,etab_b(ecx) + xor eax,dtab_1(ecx) + movzx ecx,dl + movzx ecx,etab_b(ecx) + xor eax,dtab_2(ecx) + movzx ecx,dh + movzx ecx,etab_b(ecx) + xor eax,dtab_3(ecx) + +%endmacro + +%endif + +%ifdef DECRYPTION_KEY_SCHEDULE + +%ifdef AES_128 + +%ifndef DECRYPTION_TABLE +; %define DECRYPTION_TABLE +%endif + + do_name _aes_decrypt_key128,8 + + push ebp + push ebx + push esi + push edi + mov eax,[esp+24] ; context + mov edx,[esp+20] ; key + push eax + push edx + do_call _aes_encrypt_key128,8 ; generate expanded encryption key + mov eax,10*16 + mov esi,[esp+24] ; pointer to first round key + lea edi,[esi+eax] ; pointer to last round key + add esi,32 + ; the inverse mix column transformation + mov edx,[esi-16] ; needs to be applied to all round keys + mf_call inv_mix_col ; except first and last. Hence start by + mov [esi-16],eax ; transforming the four sub-keys in the + mov edx,[esi-12] ; second round key + mf_call inv_mix_col + mov [esi-12],eax ; transformations for subsequent rounds + mov edx,[esi-8] ; can then be made more efficient by + mf_call inv_mix_col ; noting that for three of the four sub-keys + mov [esi-8],eax ; in the encryption round key ek[r]: + mov edx,[esi-4] ; + mf_call inv_mix_col ; ek[r][n] = ek[r][n-1] ^ ek[r-1][n] + mov [esi-4],eax ; + ; where n is 1..3. Hence the corresponding +.0: mov edx,[esi] ; subkeys in the decryption round key dk[r] + mf_call inv_mix_col ; also obey since inv_mix_col is linear in + mov [esi],eax ; GF(256): + xor eax,[esi-12] ; + mov [esi+4],eax ; dk[r][n] = dk[r][n-1] ^ dk[r-1][n] + xor eax,[esi-8] ; + mov [esi+8],eax ; So we only need one inverse mix column + xor eax,[esi-4] ; operation (n = 0) for each four word cycle + mov [esi+12],eax ; in the expanded key. + add esi,16 + cmp edi,esi + jg .0 + jmp dec_end + +%endif + +%ifdef AES_192 + +%ifndef DECRYPTION_TABLE +; %define DECRYPTION_TABLE +%endif + + do_name _aes_decrypt_key192,8 + + push ebp + push ebx + push esi + push edi + mov eax,[esp+24] ; context + mov edx,[esp+20] ; key + push eax + push edx + do_call _aes_encrypt_key192,8 ; generate expanded encryption key + mov eax,12*16 + mov esi,[esp+24] ; first round key + lea edi,[esi+eax] ; last round key + add esi,48 ; the first 6 words are the key, of + ; which the top 2 words are part of + mov edx,[esi-32] ; the second round key and hence + mf_call inv_mix_col ; need to be modified. After this we + mov [esi-32],eax ; need to do a further six values prior + mov edx,[esi-28] ; to using a more efficient technique + mf_call inv_mix_col ; based on: + mov [esi-28],eax ; + ; dk[r][n] = dk[r][n-1] ^ dk[r-1][n] + mov edx,[esi-24] ; + mf_call inv_mix_col ; for n = 1 .. 5 where the key expansion + mov [esi-24],eax ; cycle is now 6 words long + mov edx,[esi-20] + mf_call inv_mix_col + mov [esi-20],eax + mov edx,[esi-16] + mf_call inv_mix_col + mov [esi-16],eax + mov edx,[esi-12] + mf_call inv_mix_col + mov [esi-12],eax + mov edx,[esi-8] + mf_call inv_mix_col + mov [esi-8],eax + mov edx,[esi-4] + mf_call inv_mix_col + mov [esi-4],eax + +.0: mov edx,[esi] ; the expanded key is 13 * 4 = 44 32-bit words + mf_call inv_mix_col ; of which 11 * 4 = 44 have to be modified + mov [esi],eax ; using inv_mix_col. We have already done 8 + xor eax,[esi-20] ; of these so 36 are left - hence we need + mov [esi+4],eax ; exactly 6 loops of six here + xor eax,[esi-16] + mov [esi+8],eax + xor eax,[esi-12] + mov [esi+12],eax + xor eax,[esi-8] + mov [esi+16],eax + xor eax,[esi-4] + mov [esi+20],eax + add esi,24 + cmp edi,esi + jg .0 + jmp dec_end + +%endif + +%ifdef AES_256 + +%ifndef DECRYPTION_TABLE +; %define DECRYPTION_TABLE +%endif + + do_name _aes_decrypt_key256,8 + + mov ax, sp + movzx esp, ax + push ebp + push ebx + push esi + push edi + + movzx eax, word [esp+20] ; ks + movzx edx, word [esp+18] ; key + push ax + push dx + do_call _aes_encrypt_key256,4 ; generate expanded encryption key + mov eax,14*16 + movzx esi, word [esp+20] ; ks + lea edi,[esi+eax] + add esi,64 + + mov edx,[esi-48] ; the primary key is 8 words, of which + mf_call inv_mix_col ; the top four require modification + mov [esi-48],eax + mov edx,[esi-44] + mf_call inv_mix_col + mov [esi-44],eax + mov edx,[esi-40] + mf_call inv_mix_col + mov [esi-40],eax + mov edx,[esi-36] + mf_call inv_mix_col + mov [esi-36],eax + + mov edx,[esi-32] ; the encryption key expansion cycle is + mf_call inv_mix_col ; now eight words long so we need to + mov [esi-32],eax ; start by doing one complete block + mov edx,[esi-28] + mf_call inv_mix_col + mov [esi-28],eax + mov edx,[esi-24] + mf_call inv_mix_col + mov [esi-24],eax + mov edx,[esi-20] + mf_call inv_mix_col + mov [esi-20],eax + mov edx,[esi-16] + mf_call inv_mix_col + mov [esi-16],eax + mov edx,[esi-12] + mf_call inv_mix_col + mov [esi-12],eax + mov edx,[esi-8] + mf_call inv_mix_col + mov [esi-8],eax + mov edx,[esi-4] + mf_call inv_mix_col + mov [esi-4],eax + +.0: mov edx,[esi] ; we can now speed up the remaining + mf_call inv_mix_col ; rounds by using the technique + mov [esi],eax ; outlined earlier. But note that + xor eax,[esi-28] ; there is one extra inverse mix + mov [esi+4],eax ; column operation as the 256 bit + xor eax,[esi-24] ; key has an extra non-linear step + mov [esi+8],eax ; for the midway element. + xor eax,[esi-20] + mov [esi+12],eax ; the expanded key is 15 * 4 = 60 + mov edx,[esi+16] ; 32-bit words of which 52 need to + mf_call inv_mix_col ; be modified. We have already done + mov [esi+16],eax ; 12 so 40 are left - which means + xor eax,[esi-12] ; that we need exactly 5 loops of 8 + mov [esi+20],eax + xor eax,[esi-8] + mov [esi+24],eax + xor eax,[esi-4] + mov [esi+28],eax + add esi,32 + cmp edi,esi + jg .0 + +%endif + +dec_end: + +%ifdef AES_REV_DKS + + movzx esi,word [esp+20] ; this reverses the order of the +.1: mov eax,[esi] ; round keys if required + mov ebx,[esi+4] + mov ebp,[edi] + mov edx,[edi+4] + mov [esi],ebp + mov [esi+4],edx + mov [edi],eax + mov [edi+4],ebx + + mov eax,[esi+8] + mov ebx,[esi+12] + mov ebp,[edi+8] + mov edx,[edi+12] + mov [esi+8],ebp + mov [esi+12],edx + mov [edi+8],eax + mov [edi+12],ebx + + add esi,16 + sub edi,16 + cmp edi,esi + jg .1 + +%endif + + pop edi + pop esi + pop ebx + pop ebp + xor eax,eax + do_exit 8 + +%ifdef AES_VAR + + do_name _aes_decrypt_key,12 + + mov ecx,[esp+4] + mov eax,[esp+8] + mov edx,[esp+12] + push edx + push ecx + + cmp eax,16 + je .1 + cmp eax,128 + je .1 + + cmp eax,24 + je .2 + cmp eax,192 + je .2 + + cmp eax,32 + je .3 + cmp eax,256 + je .3 + mov eax,-1 + add esp,8 + do_exit 12 + +.1: do_call _aes_decrypt_key128,8 + do_exit 12 +.2: do_call _aes_decrypt_key192,8 + do_exit 12 +.3: do_call _aes_decrypt_key256,8 + do_exit 12 + +%endif + +%endif + +%ifdef DECRYPTION_TABLE + +; Inverse S-box data - 256 entries + + section _DATA + +%define v8(x) fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x + +_aes_dec_tab: + db v8(0x52),v8(0x09),v8(0x6a),v8(0xd5),v8(0x30),v8(0x36),v8(0xa5),v8(0x38) + db v8(0xbf),v8(0x40),v8(0xa3),v8(0x9e),v8(0x81),v8(0xf3),v8(0xd7),v8(0xfb) + db v8(0x7c),v8(0xe3),v8(0x39),v8(0x82),v8(0x9b),v8(0x2f),v8(0xff),v8(0x87) + db v8(0x34),v8(0x8e),v8(0x43),v8(0x44),v8(0xc4),v8(0xde),v8(0xe9),v8(0xcb) + db v8(0x54),v8(0x7b),v8(0x94),v8(0x32),v8(0xa6),v8(0xc2),v8(0x23),v8(0x3d) + db v8(0xee),v8(0x4c),v8(0x95),v8(0x0b),v8(0x42),v8(0xfa),v8(0xc3),v8(0x4e) + db v8(0x08),v8(0x2e),v8(0xa1),v8(0x66),v8(0x28),v8(0xd9),v8(0x24),v8(0xb2) + db v8(0x76),v8(0x5b),v8(0xa2),v8(0x49),v8(0x6d),v8(0x8b),v8(0xd1),v8(0x25) + db v8(0x72),v8(0xf8),v8(0xf6),v8(0x64),v8(0x86),v8(0x68),v8(0x98),v8(0x16) + db v8(0xd4),v8(0xa4),v8(0x5c),v8(0xcc),v8(0x5d),v8(0x65),v8(0xb6),v8(0x92) + db v8(0x6c),v8(0x70),v8(0x48),v8(0x50),v8(0xfd),v8(0xed),v8(0xb9),v8(0xda) + db v8(0x5e),v8(0x15),v8(0x46),v8(0x57),v8(0xa7),v8(0x8d),v8(0x9d),v8(0x84) + db v8(0x90),v8(0xd8),v8(0xab),v8(0x00),v8(0x8c),v8(0xbc),v8(0xd3),v8(0x0a) + db v8(0xf7),v8(0xe4),v8(0x58),v8(0x05),v8(0xb8),v8(0xb3),v8(0x45),v8(0x06) + db v8(0xd0),v8(0x2c),v8(0x1e),v8(0x8f),v8(0xca),v8(0x3f),v8(0x0f),v8(0x02) + db v8(0xc1),v8(0xaf),v8(0xbd),v8(0x03),v8(0x01),v8(0x13),v8(0x8a),v8(0x6b) + db v8(0x3a),v8(0x91),v8(0x11),v8(0x41),v8(0x4f),v8(0x67),v8(0xdc),v8(0xea) + db v8(0x97),v8(0xf2),v8(0xcf),v8(0xce),v8(0xf0),v8(0xb4),v8(0xe6),v8(0x73) + db v8(0x96),v8(0xac),v8(0x74),v8(0x22),v8(0xe7),v8(0xad),v8(0x35),v8(0x85) + db v8(0xe2),v8(0xf9),v8(0x37),v8(0xe8),v8(0x1c),v8(0x75),v8(0xdf),v8(0x6e) + db v8(0x47),v8(0xf1),v8(0x1a),v8(0x71),v8(0x1d),v8(0x29),v8(0xc5),v8(0x89) + db v8(0x6f),v8(0xb7),v8(0x62),v8(0x0e),v8(0xaa),v8(0x18),v8(0xbe),v8(0x1b) + db v8(0xfc),v8(0x56),v8(0x3e),v8(0x4b),v8(0xc6),v8(0xd2),v8(0x79),v8(0x20) + db v8(0x9a),v8(0xdb),v8(0xc0),v8(0xfe),v8(0x78),v8(0xcd),v8(0x5a),v8(0xf4) + db v8(0x1f),v8(0xdd),v8(0xa8),v8(0x33),v8(0x88),v8(0x07),v8(0xc7),v8(0x31) + db v8(0xb1),v8(0x12),v8(0x10),v8(0x59),v8(0x27),v8(0x80),v8(0xec),v8(0x5f) + db v8(0x60),v8(0x51),v8(0x7f),v8(0xa9),v8(0x19),v8(0xb5),v8(0x4a),v8(0x0d) + db v8(0x2d),v8(0xe5),v8(0x7a),v8(0x9f),v8(0x93),v8(0xc9),v8(0x9c),v8(0xef) + db v8(0xa0),v8(0xe0),v8(0x3b),v8(0x4d),v8(0xae),v8(0x2a),v8(0xf5),v8(0xb0) + db v8(0xc8),v8(0xeb),v8(0xbb),v8(0x3c),v8(0x83),v8(0x53),v8(0x99),v8(0x61) + db v8(0x17),v8(0x2b),v8(0x04),v8(0x7e),v8(0xba),v8(0x77),v8(0xd6),v8(0x26) + db v8(0xe1),v8(0x69),v8(0x14),v8(0x63),v8(0x55),v8(0x21),v8(0x0c),v8(0x7d) + +%endif -- cgit v1.2.3