################################################################
### AES-128 in CTR mode			       	             ###
### bitsliced implementation for Intel Core 2 processors     ###
### requires support of SSE extensions up to SSSE3           ###
### Author: Emilia Käsper				     ###	
### Date: 2009-03-19					     ###
### Public domain        	             		     ###
################################################################

.include "common.s"

#####################
# process_bytes: XOR msglen bytes of input with AES-128-CTR keystream and
# write the result to output.  Keystream is generated eight 16-byte blocks
# (128 bytes) at a time by the aes128 macro from common.s.
#
#int action in %edi          (not read by this routine)
#ECRYPT_ctx* ctx in %rsi
#const u8* input in %rdx
#u8* output in %rcx
#u32 msglen in %r8d
#
# ctx fields touched here:
#   0(ctx)     bitsliced key, handed to aes128 through %rax
#   1408(ctx)  16-byte counter block (loaded with movdqa, so it must be
#              16-byte aligned)
#   1420(ctx)  last 32 bits of the counter block, kept big-endian
#   1440(ctx)  64-bit running total of bytes processed
#
# Requirements and behaviour worth knowing:
#   * Whole 128-byte batches use pxor with a memory operand and movdqa,
#     so input and output must be 16-byte aligned whenever msglen >= 128.
#   * A trailing batch shorter than 128 bytes advances the counter by
#     msglen/16 rounded down; a final fragment shorter than 16 bytes does
#     not advance the counter.
#   * msglen == 0 returns immediately without touching ctx.
#
# Callee-saved registers rbx, rbp, r12-r15 are pushed and restored.
#####################
.globl _process_bytes
.globl process_bytes
_process_bytes:
process_bytes:
testl	%r8d, %r8d
jnz	.START
ret
.START:
# bitsliced key
movq	%rsi, %rax

pushq	%rbx
pushq	%rbp
pushq	%r12
pushq	%r13
pushq   %r14
pushq	%r15

#increment total length
# msglen is a 32-bit argument: the upper half of %r8 is undefined on entry,
# so zero-extend it before adding it to the 64-bit total.
movl	%r8d, %r8d
addq	%r8, 1440(%rsi)

#msglen
movl	%r8d, %r12d
#input
movq	%rdx, %rbx
#output
movq	%rcx, %rbp

.ENC_BLOCK:
	# xmm0 = current counter block; xmm1..xmm7 = counter+1 .. counter+7.
	# The increments are done on a copy whose 32-bit words were byte-swapped
	# by SWAP32, so that paddd acts on the big-endian counter word.
	movdqa 1408(%rsi), %xmm0
	movdqa %xmm0, %xmm1
	pshufb SWAP32,%xmm1
	movdqa %xmm1, %xmm2
	movdqa %xmm1, %xmm3
	movdqa %xmm1, %xmm4
	movdqa %xmm1, %xmm5
	movdqa %xmm1, %xmm6
	movdqa %xmm1, %xmm7

	paddd RCTRINC1, %xmm1
	paddd RCTRINC2, %xmm2
	paddd RCTRINC3, %xmm3
	paddd RCTRINC4, %xmm4
	paddd RCTRINC5, %xmm5
	paddd RCTRINC6, %xmm6
	paddd RCTRINC7, %xmm7

	# M0 permutes the unswapped block; M0SWAP is the same permutation for
	# blocks that are still in SWAP32 byte order.
	pshufb M0,     %xmm0
	pshufb M0SWAP, %xmm1
	pshufb M0SWAP, %xmm2
	pshufb M0SWAP, %xmm3
	pshufb M0SWAP, %xmm4
	pshufb M0SWAP, %xmm5
	pshufb M0SWAP, %xmm6
	pshufb M0SWAP, %xmm7

aes128 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rax


# output in first block > [xmm8, xmm9, xmm12, xmm14, xmm11, xmm15, xmm10, xmm13] < last block

	cmpl  	$128, %r12d
	jb	.PARTIAL

	# At least one whole 128-byte batch remains: advance the big-endian
	# counter by the eight blocks just generated.
	movl	1420(%rsi), %r14d
	bswap	%r14d
	addl 	$8, %r14d
	bswap	%r14d
	movl	%r14d, 1420(%rsi)

	pxor	(%rbx), %xmm8
	pxor	16(%rbx), %xmm9
	pxor	32(%rbx), %xmm12
	pxor	48(%rbx), %xmm14
	pxor	64(%rbx), %xmm11
	pxor	80(%rbx), %xmm15
	pxor	96(%rbx), %xmm10
	pxor	112(%rbx), %xmm13
	movdqa	%xmm8,	(%rbp)
	movdqa	%xmm9,	16(%rbp)
	movdqa	%xmm12,	32(%rbp)
	movdqa	%xmm14,	48(%rbp)
	movdqa	%xmm11,	64(%rbp)
	movdqa	%xmm15,	80(%rbp)
	movdqa	%xmm10,	96(%rbp)
	movdqa	%xmm13,	112(%rbp)
	addq	$128, %rbx
	addq	$128, %rbp
	# Loop while bytes remain; an exact multiple of 128 finishes here.
	subl	$128, %r12d
	jnz	.ENC_BLOCK
	jmp	.END

.PARTIAL:
	# add partial bytes #
	# Invariant: 0 < r12d < 128 here.
	movl	%r12d,	%r13d
	shr	$4, 	%r12d

	# Advance the counter by the number of complete 16-byte blocks used.
	movl	1420(%rsi), %r14d
	bswap	%r14d
	addl 	%r12d, %r14d
	bswap	%r14d
	movl	%r14d, 1420(%rsi)

	# Spill the eight keystream blocks to an aligned stack buffer so the
	# tail can be XORed one byte at a time.  r15 remembers the caller's rsp.
	movq	%rsp,	%r15
	subq	$128, %rsp
	andq	$-256, %rsp
	movdqa	%xmm8, (%rsp)
	movdqa	%xmm9, 16(%rsp)
	movdqa	%xmm12, 32(%rsp)
	movdqa	%xmm14, 48(%rsp)
	movdqa	%xmm11, 64(%rsp)
	movdqa	%xmm15, 80(%rsp)
	movdqa	%xmm10, 96(%rsp)
	movdqa	%xmm13, 112(%rsp)
	# r14 walks the keystream buffer; rsp stays fixed during the loop.
	movq	%rsp,	%r14
.BYTES:
	movb	(%rbx), %al
	xorb	(%r14), %al
	movb	%al,	(%rbp)
	addq	$1,	%rbx
	addq	$1,	%rbp
	addq	$1,	%r14
	subl	$1,	%r13d
	jnz	.BYTES
	movq	%r15, %rsp

.END:
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx
	ret
# Author: Emilia Käsper				     	
# Date: 2009-03-19					     
# Public domain 

.include "common.s"

.text
.p2align 5
# ECRYPT_keysetup: run the key-schedule macros from common.s.
#   %rdi  passed to every macro as the last operand
#         (the visible wrapper ECRYPT_AE_keysetup passes ctx here)
#   %rsi  passed to bitslicekey0 as its first operand
#         (the wrapper leaves the key pointer here)
# The macros bitslicekey0 / keyexp1 / keyexp / keyexp9 / keyexp10 are
# defined outside this file; presumably they write the bitsliced round
# keys for rounds 0..10 into the buffer at %rdi -- confirm in common.s.
# The xmm operand order differs from round to round; keep it exactly as is.
.globl _ECRYPT_keysetup
.globl  ECRYPT_keysetup
_ECRYPT_keysetup:
ECRYPT_keysetup:
# Round %rsp down to a 32-byte boundary.  %r11 holds the adjustment
# (rsp mod 32, plus 0 extra bytes) and is added back before ret, so the
# macros below must leave %r11 intact -- TODO confirm against common.s.
mov %rsp,%r11
and $31,%r11
add $0,%r11
sub %r11,%rsp

bitslicekey0 %rsi, %rdi

keyexp1  %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rdi
keyexp   %xmm0, %xmm1, %xmm4, %xmm6, %xmm3, %xmm7, %xmm2, %xmm5, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, 2, %rdi
keyexp   %xmm0, %xmm1, %xmm3, %xmm2, %xmm6, %xmm5, %xmm4, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm6, 3, %rdi
keyexp   %xmm0, %xmm1, %xmm6, %xmm4, %xmm2, %xmm7, %xmm3, %xmm5, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm3, 4, %rdi
keyexp   %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm3, 5, %rdi
keyexp   %xmm0, %xmm1, %xmm4, %xmm6, %xmm3, %xmm7, %xmm2, %xmm5, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm5, 6, %rdi
keyexp   %xmm0, %xmm1, %xmm3, %xmm2, %xmm6, %xmm5, %xmm4, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm3, 7, %rdi
keyexp   %xmm0, %xmm1, %xmm6, %xmm4, %xmm2, %xmm7, %xmm3, %xmm5, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm7, 8, %rdi
keyexp9  %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rdi
keyexp10 %xmm0, %xmm1, %xmm4, %xmm6, %xmm3, %xmm7, %xmm2, %xmm5, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rdi

# Undo the alignment adjustment made on entry.
add %r11,%rsp

ret
# Author: Emilia Käsper and Peter Schwabe
# Date: 2009-03-19
# Public domain

# Shared 16-byte constants for the bitsliced AES code.  Every entry is
# exactly 16 bytes and the table starts on a 64-byte boundary, so each
# label is 16-byte aligned and can be used directly as an SSE memory
# operand.
.data

.globl RCON
.globl ROTB
.globl EXPB0
.globl ONE
.globl BS0
.globl BS1
.globl BS2
.globl CTRINC1
.globl CTRINC2
.globl CTRINC3
.globl CTRINC4
.globl CTRINC5
.globl CTRINC6
.globl CTRINC7
.globl M0
.globl SRM0
.globl SR

.p2align 6
#.align 16

#.section .rodata

# RCON / ROTB / EXPB0 are not referenced by the code visible here;
# presumably they are used by the key-expansion macros -- confirm in the
# macro definitions.
RCON: .int 0x00000000, 0x00000000, 0x00000000, 0xffffffff
ROTB: .int 0x0c000000, 0x00000000, 0x04000000, 0x08000000
EXPB0: .int 0x03030303, 0x07070707, 0x0b0b0b0b, 0x0f0f0f0f
# CTRINCn holds the value n in the lowest 32-bit lane and zero elsewhere.
CTRINC1: .int 0x00000001, 0x00000000, 0x00000000, 0x00000000
CTRINC2: .int 0x00000002, 0x00000000, 0x00000000, 0x00000000
CTRINC3: .int 0x00000003, 0x00000000, 0x00000000, 0x00000000
CTRINC4: .int 0x00000004, 0x00000000, 0x00000000, 0x00000000
CTRINC5: .int 0x00000005, 0x00000000, 0x00000000, 0x00000000
CTRINC6: .int 0x00000006, 0x00000000, 0x00000000, 0x00000000
CTRINC7: .int 0x00000007, 0x00000000, 0x00000000, 0x00000000

# BS0/BS1/BS2 select every other bit, bit pair and nibble respectively
# (presumably the masks for the bitslicing transform); ONE is all ones.
BS0: .quad 0x5555555555555555, 0x5555555555555555
BS1: .quad 0x3333333333333333, 0x3333333333333333
BS2: .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
ONE: .quad 0xffffffffffffffff, 0xffffffffffffffff
# Byte-shuffle (pshufb) masks.  M0 is applied to the counter block in
# process_bytes before the aes128 macro.  As a byte list M0 reads
# 15,11,7,3,14,10,6,2,13,9,5,1,12,8,4,0.
M0:  .quad 0x02060a0e03070b0f, 0x0004080c0105090d
SRM0:	.quad 0x0304090e00050a0f, 0x01060b0c0207080d
# SR rotates the four 4-byte groups by 1, 2, 3 and 0 byte positions
# (presumably ShiftRows in the bitsliced byte layout).
SR: .quad 0x0504070600030201, 0x0f0e0d0c0a09080b


# Author: Emilia Käsper and Peter Schwabe
# Date: 2009-03-19
# Public domain

# 16-byte constants for the counter handling and the GCM authentication
# code.  Every entry is exactly 16 bytes and the table starts on a
# 64-byte boundary, so each label is 16-byte aligned.
.data

.globl BM31
.globl BM30
.globl BM29
.globl BM28

.globl BM27
.globl BM26
.globl BM25
.globl BM24

.globl BM23
.globl BM22
.globl BM21
.globl BM20

.globl BM19
.globl BM18
.globl BM17
.globl BM16

.globl BM15
.globl BM14
.globl BM13
.globl BM12

.globl BM11
.globl BM10
.globl BM09
.globl BM08

.globl BM07
.globl BM06
.globl BM05
.globl BM04

.globl BM03
.globl BM02
.globl BM01
.globl BM00

.globl REVERS

.globl BIT063
.globl BIT064
.globl BIT127
.globl GCMPOL

.globl SWAP32
.globl M0SWAP

.globl RCTRINC1
.globl RCTRINC2
.globl RCTRINC3
.globl RCTRINC4
.globl RCTRINC5
.globl RCTRINC6
.globl RCTRINC7

.p2align 6
#.align 16

#.section .rodata

# SWAP32: pshufb mask that reverses the byte order inside each 32-bit
# word.  process_bytes applies it to the counter block so that paddd can
# increment the big-endian counter word.
SWAP32: .int 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
# RCTRINCn holds the value n in the highest 32-bit lane; process_bytes
# adds them (paddd) to the byte-swapped counter block to form
# counter+1 .. counter+7.
RCTRINC1: .int 0x00000000, 0x00000000, 0x00000000, 0x00000001
RCTRINC2: .int 0x00000000, 0x00000000, 0x00000000, 0x00000002
RCTRINC3: .int 0x00000000, 0x00000000, 0x00000000, 0x00000003
RCTRINC4: .int 0x00000000, 0x00000000, 0x00000000, 0x00000004
RCTRINC5: .int 0x00000000, 0x00000000, 0x00000000, 0x00000005
RCTRINC6: .int 0x00000000, 0x00000000, 0x00000000, 0x00000006
RCTRINC7: .int 0x00000000, 0x00000000, 0x00000000, 0x00000007

# REVERS: pshufb mask that reverses all 16 bytes of a register.  Used by
# authenticate, tablesetup and finalmul.
REVERS: .quad 0x08090A0B0C0D0E0F, 0x0001020304050607

# Single-bit and polynomial constants.  They are not referenced by the
# code visible here; presumably the Mul_X / Mul_H macros use them --
# confirm in the macro definitions.
BIT063: .quad 0x0000000000000000, 0x0000000000000001
BIT064: .quad 0x8000000000000000, 0x0000000000000000
BIT127: .quad 0x0000000000000001, 0x0000000000000000
GCMPOL: .quad 0x0000000000000000, 0xE100000000000000

# BMnn: one bit set in each of the four 32-bit lanes.  BM31 is the
# lowest bit (0x00000001) and BM00 the highest (0x80000000), so nn
# counts bit positions from the most significant end of the lane.
BM31: .quad 0x0000000100000001, 0x0000000100000001
BM30: .quad 0x0000000200000002, 0x0000000200000002
BM29: .quad 0x0000000400000004, 0x0000000400000004
BM28: .quad 0x0000000800000008, 0x0000000800000008

BM27: .quad 0x0000001000000010, 0x0000001000000010
BM26: .quad 0x0000002000000020, 0x0000002000000020
BM25: .quad 0x0000004000000040, 0x0000004000000040
BM24: .quad 0x0000008000000080, 0x0000008000000080

BM23: .quad 0x0000010000000100, 0x0000010000000100
BM22: .quad 0x0000020000000200, 0x0000020000000200
BM21: .quad 0x0000040000000400, 0x0000040000000400
BM20: .quad 0x0000080000000800, 0x0000080000000800

BM19: .quad 0x0000100000001000, 0x0000100000001000
BM18: .quad 0x0000200000002000, 0x0000200000002000
BM17: .quad 0x0000400000004000, 0x0000400000004000
BM16: .quad 0x0000800000008000, 0x0000800000008000

BM15: .quad 0x0001000000010000, 0x0001000000010000
BM14: .quad 0x0002000000020000, 0x0002000000020000
BM13: .quad 0x0004000000040000, 0x0004000000040000
BM12: .quad 0x0008000000080000, 0x0008000000080000

BM11: .quad 0x0010000000100000, 0x0010000000100000
BM10: .quad 0x0020000000200000, 0x0020000000200000
BM09: .quad 0x0040000000400000, 0x0040000000400000
BM08: .quad 0x0080000000800000, 0x0080000000800000

BM07: .quad 0x0100000001000000, 0x0100000001000000
BM06: .quad 0x0200000002000000, 0x0200000002000000
BM05: .quad 0x0400000004000000, 0x0400000004000000
BM04: .quad 0x0800000008000000, 0x0800000008000000

BM03: .quad 0x1000000010000000, 0x1000000010000000
BM02: .quad 0x2000000020000000, 0x2000000020000000
BM01: .quad 0x4000000040000000, 0x4000000040000000
BM00: .quad 0x8000000080000000, 0x8000000080000000

# M0SWAP: the M0 byte permutation for a block that has already been
# shuffled by SWAP32 (each index is the matching M0 index XOR 3), so the
# result equals applying M0 to the unswapped block.
M0SWAP: .quad 0x0105090d0004080c , 0x03070b0f02060a0e



.include "common-gcm.s"

.text

.globl authenticate

#---------------------------------------------------------------------
# authenticate: fold msglen bytes into the 16-byte accumulator kept at
# 1424(ctx), one 16-byte block at a time:
#     acc = Mul_H(acc XOR byte-reversed block)
# A trailing fragment shorter than 16 bytes is zero-padded on the stack
# and processed as one more block.
#
#   %rdi  ctx
#   %rsi  data to absorb (read with unaligned loads)
#   %rdx  msglen (all 64 bits are used)
#
# Mul_H comes from common-gcm.s; it is given the table address
# ctx+1456 in %r15 and leaves %r15 advanced by 496 bytes, which is
# undone after every block.
#
# Frame (32-byte aligned):  0..15 padding buffer, 32..87 register saves.
# On return %rax = %rdi and %rdx = %rsi as they stand at the end.
#---------------------------------------------------------------------
authenticate:

	# Reserve an aligned frame; %r11 = number of bytes taken from %rsp.
	mov	%rsp, %r11
	and	$31, %r11
	add	$96, %r11
	sub	%r11, %rsp

	movq	%r11, 32(%rsp)
	movq	%r12, 40(%rsp)
	movq	%r13, 48(%rsp)
	movq	%r14, 56(%rsp)
	movq	%r15, 64(%rsp)
	movq	%rbx, 72(%rsp)
	movq	%rbp, 80(%rsp)

	movq	%rdi, %r14			# r14 = ctx, kept for the final store
	leaq	1456(%rdi), %r15		# r15 = multiplication table
	mov	%rdx, %rcx			# rcx = bytes left
	movdqa	1424(%rdi), %xmm0		# xmm0 = accumulator

.Lauth_next_block:
	sub	$16, %rcx
	jb	.Lauth_tail			# fewer than 16 bytes left

	movdqu	(%rsi), %xmm3
	pshufb	REVERS, %xmm3
	pxor	%xmm3, %xmm0
	pxor	%xmm15, %xmm15

Mul_H %xmm0, %r15, %xmm1, %xmm2, %xmm15

	movdqa	%xmm15, %xmm0
	addq	$16, %rsi
	subq	$496, %r15			# restore table address
	jmp	.Lauth_next_block

.Lauth_tail:
	add	$16, %rcx			# rcx = leftover byte count (0..15)
	jz	.Lauth_done

	# Copy the leftover bytes into a zeroed 16-byte stack buffer and
	# run it through the block loop once.
	xorl	%r10d, %r10d
	movq	%rsp, %r12
	movq	%r10, 0(%r12)
	movq	%r10, 8(%r12)
	mov	%r12, %rdi
	rep movsb
	mov	%r12, %rsi
	mov	$16, %rcx
	jmp	.Lauth_next_block

.Lauth_done:
	movdqa	%xmm0, 1424(%r14)

	movq	32(%rsp), %r11
	movq	40(%rsp), %r12
	movq	48(%rsp), %r13
	movq	56(%rsp), %r14
	movq	64(%rsp), %r15
	movq	72(%rsp), %rbx
	movq	80(%rsp), %rbp

	add	%r11, %rsp
	mov	%rdi, %rax
	mov	%rsi, %rdx
	ret




.include "common-gcm.s"


.text

#---------------------------------------------------------------------
# tablesetup: fill the 128-entry table at ctx+1456.
#   %rdi  ctx
#   %rsi  h, 16 bytes, 16-byte aligned (loaded with movdqa)
#
# Entry 0 is h with its 16 bytes reversed; every following entry is
# the previous one passed through the Mul_X macro (common-gcm.s;
# presumably multiplication by x in GF(2^128) -- confirm there).
# 128 entries of 16 bytes are written in total.
#---------------------------------------------------------------------

.globl tablesetup

tablesetup:
	pushq	%r15
	pushq	%r14

	movl	$128, %r15d			# entries still to account for
	leaq	1456(%rdi), %r14		# r14 = current table slot

	movdqa	(%rsi), %xmm1
	pshufb	REVERS, %xmm1
	movdqa	%xmm1, (%r14)			# entry 0

.Ltablesetup_next:
	addq	$16, %r14
	subq	$1, %r15
	jz	.Ltablesetup_done
	Mul_X	%xmm1, %xmm2, %xmm3
	movdqa	%xmm1, (%r14)
	jmp	.Ltablesetup_next

.Ltablesetup_done:
	popq	%r14
	popq	%r15
	ret



	.file	"wrapper.c"
	# Compiler output (see the .ident line at the end of this unit) for
	# wrapper.c; the comments were added by hand.
	.text
	.p2align 4,,15
# ECRYPT_init: empty function, returns immediately.
.globl ECRYPT_init
	.type	ECRYPT_init, @function
ECRYPT_init:
.LFB539:
	.cfi_startproc
	# "rep ret" is a two-byte return emitted by the compiler.
	rep
	ret
	.cfi_endproc
.LFE539:
	.size	ECRYPT_init, .-ECRYPT_init
	.p2align 4,,15
# ECRYPT_AE_finalize: produce the authentication tag.
#   %rdi  ctx
#   %rsi  mac (output pointer, handed on to finalmul)
# Steps:
#   - set bytes 12..15 of the counter block at ctx+1408 to 0,0,0,1
#   - run process_bytes over a zeroed 16-byte stack buffer in place
#   - call finalmul(ctx, mac, buffer) to combine the result with the
#     accumulator
.globl ECRYPT_AE_finalize
	.type	ECRYPT_AE_finalize, @function
ECRYPT_AE_finalize:
.LFB543:
	.cfi_startproc
	leaq	1408(%rdi), %rax
	# Save callee-saved registers below %rsp, then open a 40-byte frame.
	movq	%rbx, -24(%rsp)
	movq	%rbp, -16(%rsp)
	movq	%r12, -8(%rsp)
	subq	$40, %rsp
	.cfi_def_cfa_offset 48
	movq	%rdi, %rbx
	.cfi_offset 12, -16
	.cfi_offset 6, -24
	.cfi_offset 3, -32
	# Zero the 16-byte buffer at (%rsp); it is both input and output.
	movq	$0, (%rsp)
	movq	$0, 8(%rsp)
	movq	%rsi, %r12
	movq	%rsp, %rcx
	movq	%rsp, %rdx
	movq	%rdi, %rsi
	# Last four bytes of the counter block become 00 00 00 01.
	movb	$0, 12(%rax)
	movb	$0, 13(%rax)
	movl	$16, %r8d
	movb	$0, 14(%rax)
	movb	$1, 15(%rax)
	xorl	%edi, %edi
	# process_bytes(0, ctx, buf, buf, 16)
	call	process_bytes
	movq	%rsp, %rdx
	movq	%r12, %rsi
	movq	%rbx, %rdi
	# finalmul(ctx, mac, buf)
	call	finalmul
	movq	16(%rsp), %rbx
	movq	24(%rsp), %rbp
	movq	32(%rsp), %r12
	addq	$40, %rsp
	ret
	.cfi_endproc
.LFE543:
	.size	ECRYPT_AE_finalize, .-ECRYPT_AE_finalize
	.p2align 4,,15
# ECRYPT_AE_process_bytes: run process_bytes and authenticate over one
# message chunk.
#   %edi  action
#   %rsi  ctx
#   %rdx  input
#   %rcx  output
#   %r8d  msglen
# action != 0: authenticate(ctx, input, msglen) first, then tail-call
#              process_bytes(0, ctx, input, output, msglen).
# action == 0: process_bytes with the incoming arguments first, then
#              tail-call authenticate(ctx, output, msglen).
# (Presumably action 0 is encryption and non-zero is decryption, so the
# authenticated data is the ciphertext either way -- confirm in wrapper.c.)
.globl ECRYPT_AE_process_bytes
	.type	ECRYPT_AE_process_bytes, @function
ECRYPT_AE_process_bytes:
.LFB542:
	.cfi_startproc
	# Save callee-saved registers below %rsp, then open a 40-byte frame.
	movq	%rbx, -32(%rsp)
	movq	%rbp, -24(%rsp)
	movq	%rsi, %rbx
	.cfi_offset 6, -32
	.cfi_offset 3, -40
	movq	%r12, -16(%rsp)
	movq	%r13, -8(%rsp)
	subq	$40, %rsp
	.cfi_def_cfa_offset 48
	testl	%edi, %edi
	# rbx = ctx, r12 = input, r13 = output, ebp = msglen
	movq	%rdx, %r12
	.cfi_offset 13, -16
	.cfi_offset 12, -24
	movq	%rcx, %r13
	movl	%r8d, %ebp
	je	.L9
	# action != 0: authenticate the input, then process it.
	movl	%r8d, %edx
	movq	%r12, %rsi
	movq	%rbx, %rdi
	call	authenticate
	movl	%ebp, %r8d
	movq	%r13, %rcx
	movq	%r12, %rdx
	movq	%rbx, %rsi
	movq	16(%rsp), %rbp
	movq	8(%rsp), %rbx
	movq	24(%rsp), %r12
	movq	32(%rsp), %r13
	xorl	%edi, %edi
	addq	$40, %rsp
	# Tail call: process_bytes returns directly to our caller.
	jmp	process_bytes
	.p2align 4,,10
	.p2align 3
.L9:
	# action == 0: the argument registers still hold the caller's values.
	call	process_bytes
	movl	%ebp, %edx
	movq	%r13, %rsi
	movq	%rbx, %rdi
	movq	16(%rsp), %rbp
	movq	8(%rsp), %rbx
	movq	24(%rsp), %r12
	movq	32(%rsp), %r13
	addq	$40, %rsp
	# Tail call: authenticate(ctx, output, msglen).
	jmp	authenticate
	.cfi_endproc
.LFE542:
	.size	ECRYPT_AE_process_bytes, .-ECRYPT_AE_process_bytes
	.p2align 4,,15
# ECRYPT_AE_keysetup: key schedule plus authentication-table setup.
#   %rdi  ctx
#   %rsi  key (passed through unchanged to ECRYPT_keysetup)
# Steps:
#   - zero the 16-byte counter block at ctx+1408
#   - ECRYPT_keysetup(ctx, key, 16, 12)
#   - process_bytes over a zeroed 16-byte stack buffer in place
#   - tablesetup(ctx, buffer) builds the table from that result
.globl ECRYPT_AE_keysetup
	.type	ECRYPT_AE_keysetup, @function
ECRYPT_AE_keysetup:
.LFB541:
	.cfi_startproc
	pushq	%rbp
	.cfi_def_cfa_offset 16
	# Third and fourth arguments of ECRYPT_keysetup: 16 and 12.
	movl	$12, %ecx
	movl	$16, %edx
	pushq	%rbx
	.cfi_def_cfa_offset 24
	movq	%rdi, %rbx
	.cfi_offset 3, -24
	.cfi_offset 6, -16
	subq	$24, %rsp
	.cfi_def_cfa_offset 48
	# Zero the 16-byte stack buffer and the counter block.
	movq	$0, (%rsp)
	movq	$0, 8(%rsp)
	movq	$0, 1408(%rdi)
	movq	$0, 1416(%rdi)
	call	ECRYPT_keysetup
	movq	%rsp, %rcx
	movq	%rsp, %rdx
	movq	%rbx, %rsi
	movl	$16, %r8d
	xorl	%edi, %edi
	# process_bytes(0, ctx, buf, buf, 16)
	call	process_bytes
	movq	%rsp, %rsi
	movq	%rbx, %rdi
	# tablesetup(ctx, buf)
	call	tablesetup
	addq	$24, %rsp
	popq	%rbx
	popq	%rbp
	ret
	.cfi_endproc
.LFE541:
	.size	ECRYPT_AE_keysetup, .-ECRYPT_AE_keysetup
	.p2align 4,,15
# ECRYPT_AE_ivsetup: start a new message.
#   %rdi  ctx
#   %rsi  iv (12 bytes are read)
# Writes the counter block at ctx+1408 as iv[0..11] followed by the
# bytes 00 00 00 02, and zeroes ctx+1424 .. ctx+1447 (the 16-byte
# accumulator used by authenticate and the 8-byte length total used by
# process_bytes).
.globl ECRYPT_AE_ivsetup
	.type	ECRYPT_AE_ivsetup, @function
ECRYPT_AE_ivsetup:
.LFB540:
	.cfi_startproc
	# iv bytes 0..7
	movq	(%rsi), %rdx
	leaq	1408(%rdi), %rax
	movq	%rdx, 1408(%rdi)
	# iv bytes 8..11 (stored a few lines below)
	movl	8(%rsi), %edx
	# Counter bytes 12..15 become 00 00 00 02.
	movb	$0, 12(%rax)
	movb	$0, 13(%rax)
	movb	$0, 14(%rax)
	movb	$2, 15(%rax)
	movl	%edx, 8(%rax)
	# Clear the accumulator and the length total.
	movq	$0, 1424(%rdi)
	movq	$0, 1432(%rdi)
	movq	$0, 1440(%rdi)
	ret
	.cfi_endproc
.LFE540:
	.size	ECRYPT_AE_ivsetup, .-ECRYPT_AE_ivsetup
	.ident	"GCC: (Debian 4.4.4-7) 4.4.4"
	.section	.note.GNU-stack,"",@progbits

.include "common-gcm.s"

.text

.globl finalmul

#---------------------------------------------------------------------
# finalmul: finish the authentication tag.
#   %rdi  ctx
#   %rsi  mac  (16 bytes written, unaligned store)
#   %rdx  ey0  (16 bytes read, unaligned load)
#
#   bits = (total at 1440(ctx) - 16) * 8
#   mac  = ey0 XOR byte-reverse( Mul_H( accumulator XOR bits ) )
#
# The total kept at 1440(ctx) includes the 16 bytes processed to
# produce ey0, hence the subtraction.  Mul_H comes from common-gcm.s
# and is given the table address ctx+1456 in %r15.
#---------------------------------------------------------------------
finalmul:

	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12

	leaq	1456(%rdi), %r15		# r15 = multiplication table

	# (len - 16) * 8 computed as len*8 - 128
	movq	1440(%rdi), %r12
	leaq	-128(,%r12,8), %r12
	movd 	%r12, %xmm14			# bit count in the low 64 bits

	movdqa	1424(%rdi), %xmm0		# accumulator
	pxor	%xmm14, %xmm0
	pxor	%xmm15, %xmm15

Mul_H %xmm0, %r15, %xmm1, %xmm2, %xmm15

	pshufb	REVERS, %xmm15

	movdqu	(%rdx), %xmm0
	pxor	%xmm15, %xmm0
	movdqu 	%xmm0, (%rsi)

	popq	%r12
	popq	%r13
	popq	%r14
	popq	%r15
	ret


