/* AES back-end selection.
/  If the build has not already set one of the USE_*_AES macros to non-zero,
/  the defaults below apply (USE_AES_NI only).
/  NOTE: with USE_AES_NI as the only selected back-end, the ae_encrypt
/  section further down in this file stops the build with an #error.       */
#if !(USE_OPENSSL_AES || USE_OPENSSL_AES_NI || USE_REFERENCE_AES ||           \
      USE_AES_NI || USE_VIA_ACE_AES || USE_KASPER_AES)
#define USE_OPENSSL_AES            0         /* http://openssl.org         */
#define USE_OPENSSL_AES_NI         0         /* http://openssl.org         */
#define USE_REFERENCE_AES          0         /* Google: rijndael-alg-fst.c */
#define USE_AES_NI                 1         /* Uses compiler's intrinsics */
#define USE_VIA_ACE_AES            0         /* xcrypt* inline assembly    */
#define USE_KASPER_AES             0         /* External ECRYPT_* routines */
#endif

/* MAX_KEY_BYTES specifies the maximum size key, in bytes, that you intend to
/  pass to ae_init, and *must* be 16, 24, or 32. It sizes the rd_key arrays
/  of the reference and AES-NI AES_KEY types in this file, so keep it as
/  small as possible.                                                      */
#define MAX_KEY_BYTES             16

/* To eliminate the use of vector types, set the following non-zero. This
/  forces both USE_SSE2 and USE_ALTIVEC to evaluate to 0.                  */
#define VECTORS_OFF                0

/* ----------------------------------------------------------------------- */
/* Derived configuration options - Adjust as needed                        */
/* ----------------------------------------------------------------------- */

/* These determine whether vectors should be used. Both are only evaluated
/  inside #if, where a compiler macro that is not defined counts as 0.
/  USE_SSE2 selects the __m128i block type, USE_ALTIVEC the AltiVec vector
/  type; when both are 0 a struct of two uint64_t is used instead.         */
#define USE_SSE2    ((__SSE2__ || (_M_IX86_FP>=2) || _M_X64) && !VECTORS_OFF)
#define USE_ALTIVEC (__ALTIVEC__ && !VECTORS_OFF)

/* ----------------------------------------------------------------------- */
/* Includes and compiler specific definitions                              */
/* ----------------------------------------------------------------------- */

#include "ae.h"
#include <stdlib.h>
#include <string.h>

/* Define standard sized integers.
/  MSVC with _MSC_VER below 1600 gets hand-written typedefs built on the
/  compiler's __intN types; every other compiler includes <stdint.h>.      */
#if defined(_MSC_VER) && (_MSC_VER < 1600)
	typedef unsigned __int8  uint8_t;
	typedef unsigned __int32 uint32_t;
	typedef unsigned __int64 uint64_t;
	typedef          __int64 int64_t;
#else
	#include <stdint.h>
#endif

/* How to force specific alignment, request inline, restrict pointers.
/  ALIGN(n) expands to the compiler's alignment attribute, or to nothing
/  when no such attribute is known. `inline` and `restrict` are mapped to
/  the compiler's own spelling, or erased for compilers that have neither. */
#if __GNUC__
	#define ALIGN(n) __attribute__ ((aligned(n)))
	#define inline __inline__
	#define restrict __restrict__
#elif _MSC_VER
	#define ALIGN(n) __declspec(align(n))
	#define inline __inline
	#define restrict __restrict
#elif __STDC_VERSION__ >= 199901L   /* C99: delete align, keep others      */
	#define ALIGN(n)
#else /* Not GNU/Microsoft/C99: delete alignment/inline/restrict uses.     */
	#define ALIGN(n)
	#define inline
	#define restrict
#endif

/* Endian reversal of 32-bit and 64-bit integers.
/  bswap32 is selected first and defined exactly once; the portable bswap64
/  at the end of this section is built from two bswap32's. (Defining
/  bswap32 in two places with different replacement lists is an invalid
/  macro redefinition on compilers that reach the generic fallback.)       */

/* How to endian reverse a uint32_t                                        */
#if _MSC_VER
    /* The 32-bit MSVC intrinsic is _byteswap_ulong, declared in <stdlib.h> */
    #define bswap32(x) _byteswap_ulong(x)
#elif (__GNUC__ > 4) || (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 3)) && !__arm__)
    /* GCC 5 and later on any target, or GCC 4.3+ when not targeting ARM   */
    #define bswap32(x) ((uint32_t)__builtin_bswap32((uint32_t)(x)))
#elif __GNUC__ && (__ARM_ARCH_6__ || __ARM_ARCH_6J__ || __ARM_ARCH_6K__ ||    \
    __ARM_ARCH_6Z__ || __ARM_ARCH_6ZK__ || __ARM_ARCH_6T2__ ||              \
    __ARM_ARCH_7__ || __ARM_ARCH_7A__ || __ARM_ARCH_7R__ || __ARM_ARCH_7M__)
	#define bswap32(x) ({uint32_t y; __asm__("rev %0, %1":"=r"(y):"r"(x));y;})
#elif __GNUC__ && __arm__
	#define bswap32(x)                             \
		({uint32_t t,y;                            \
		__asm__("eor     %1, %2, %2, ror #16\n\t" \
				"bic     %1, %1, #0x00FF0000\n\t" \
				"mov     %0, %2, ror #8\n\t"      \
				"eor     %0, %0, %1, lsr #8"      \
				: "=r"(y), "=&r"(t) : "r"(x));y;})
#elif __GNUC__ && __i386__
	/* 32-bit temporary, so the "r" operand is one 32-bit register         */
	#define bswap32(x) ({uint32_t y=(x);__asm__("bswap %0":"+r"(y));y;})
#else        /* Some compilers recognize the following pattern */
	#define bswap32(x)                         \
	   ((((x) & UINT32_C(0xff000000)) >> 24) | \
		(((x) & UINT32_C(0x00ff0000)) >>  8) | \
		(((x) & UINT32_C(0x0000ff00)) <<  8) | \
		(((x) & UINT32_C(0x000000ff)) << 24))
#endif

/* How to endian reverse a uint64_t                                        */
#if _MSC_VER
    #define bswap64(x) _byteswap_uint64(x)
#elif (__GNUC__ > 4) || (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 3)) && !__arm__)
    #define bswap64(x) __builtin_bswap64(x)
#elif __GNUC__ && __amd64__
    #define bswap64(x) ({uint64_t y=(x);__asm__("bswapq %0":"+r"(y));y;})
#else

/* Build bswap64 out of two bswap32's: reverse each 32-bit half and swap
/  the two halves. The result does not depend on host byte order.          */
static inline uint64_t bswap64(uint64_t x) {
	union { uint64_t ll; uint32_t l[2]; } w, r;
	w.ll = x;
	r.l[0] = bswap32(w.l[1]);
	r.l[1] = bswap32(w.l[0]);
	return r.ll;
}

#endif

/* Return x byte-reversed on a little-endian host and unchanged otherwise.
/  Host byte order is probed at run time: the first byte of an unsigned
/  holding 1 is non-zero only when the least significant byte comes first. */
static inline uint32_t bswap32_if_le(uint32_t x)
{
	const union { unsigned word; unsigned char first_byte; } probe = { 1 };

	if (probe.first_byte)
		return bswap32(x);
	return x;
}

/* ----------------------------------------------------------------------- */
/* Define blocks and operations -- Patch if incorrect on your compiler.    */
/* ----------------------------------------------------------------------- */

#if USE_SSE2
    /* SSE2 flavour: a block is one 16-byte aligned __m128i and every block
    /  operation maps onto an intrinsic.                                   */
    #include <xmmintrin.h>        /* SSE instructions and _mm_malloc */
    #include <emmintrin.h>        /* SSE2 instructions               */
    typedef ALIGN(16) __m128i block;
    /* Adds 1 to a single 32-bit lane: the lane given as the FIRST argument
    /  of _mm_set_epi32. The other three lanes are unchanged.              */
    #define add_one(b)            _mm_add_epi32(b,_mm_set_epi32(1,0,0,0))
    #define xor_block(x, y)       _mm_xor_si128(x,y)
    #define zero_block()          _mm_setzero_si128()
    /* Byte-wise compare, then gather one mask bit per byte: the 16-bit mask
    /  is 0xffff only when all 16 bytes are equal.                         */
    #define unequal_blocks(x, y) \
    					   (_mm_movemask_epi8(_mm_cmpeq_epi8(x,y)) != 0xffff)
	#if __SSSE3__
    #include <tmmintrin.h>        /* SSSE3 instructions              */
    /* Reverse all 16 bytes with a single byte shuffle.                    */
    #define swap_if_le(b) \
      _mm_shuffle_epi8(b,_mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15))
	#else
    /* SSE2-only reversal of all 16 bytes: reverse the order of the four
    /  32-bit words, swap the 16-bit halves inside each word, then swap the
    /  two bytes of every 16-bit lane. The two shifted values have no bits
    /  in common, so the final XOR simply merges them.                     */
    static inline block swap_if_le(block b) {
		block a = _mm_shuffle_epi32  (b, _MM_SHUFFLE(0,1,2,3));
		a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(2,3,0,1));
		a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(2,3,0,1));
		return _mm_xor_si128(_mm_srli_epi16(a,8), _mm_slli_epi16(a,8));
    }
	#endif
#elif USE_ALTIVEC
    /* AltiVec flavour: a block is one 16-byte aligned vector of unsigned
    /  ints and the block operations map onto vec_* primitives.            */
    #include <altivec.h>
    typedef ALIGN(16) vector unsigned block;
    /* Adds the constant vector {0,0,0,1}, i.e. 1 to the last element only. */
    static inline block add_one(block b) {const vector unsigned int one = {0,0,0,1}; return vec_add(b,one);}
    #define xor_block(x,y)        vec_xor(x,y)
    #define zero_block()          vec_splat_u32(0)
    #define unequal_blocks(x,y)   vec_any_ne(x,y)
    /* Identity here. NOTE(review): presumably because the AltiVec targets
    /  this was written for are big-endian -- confirm for little-endian PPC. */
    #define swap_if_le(b)         (b)
#else
    /* Portable flavour: a block is a pair of 64-bit halves, so no vector
    /  unit is needed.                                                     */
    typedef struct {
        uint64_t l;    /* first  eight bytes of the block */
        uint64_t r;    /* second eight bytes of the block */
    } block;

    /* Return x with its r half incremented by one (wraps modulo 2^64).    */
    static block add_one(block x)
    {
        x.r = x.r + 1;
        return x;
    }

    /* Return the bitwise XOR of the two blocks.                           */
    static block xor_block(block x, block y)
    {
        block out;
        out.l = x.l ^ y.l;
        out.r = x.r ^ y.r;
        return out;
    }

    /* Return a block with both halves zero.                               */
    static block zero_block(void)
    {
        block z;
        z.l = 0;
        z.r = 0;
        return z;
    }

    /* Non-zero iff the two blocks differ. Kept in XOR/OR form so both
    /  halves are always examined, with no data-dependent branch.          */
    #define unequal_blocks(x, y) \
        ((((x).l ^ (y).l) | ((x).r ^ (y).r)) != 0)
    /* On a little-endian host return b with each 64-bit half byte-reversed
    /  (the halves themselves keep their positions); on a big-endian host
    /  return b unchanged. Host byte order is probed at run time.           */
    static inline block swap_if_le(block b) {
        const union { unsigned word; unsigned char first_byte; } probe = { 1 };
        block out = b;

        if (probe.first_byte) {
            out.l = bswap64(b.l);
            out.r = bswap64(b.r);
        }
        return out;
    }
#endif

/* Overlay giving access to the 16 bytes of a block as 64-bit words, 32-bit
/  words, or single bytes. Reading a member other than the one last written
/  is technically undefined, but well supported in compilers.              */
typedef union {
	uint64_t u64[2];     /* two 64-bit words   */
	uint32_t u32[4];     /* four 32-bit words  */
	uint8_t  u8[16];     /* sixteen bytes      */
	block    bl;         /* the block itself   */
} block_multiview;

/* ----------------------------------------------------------------------- */
/* AES - Code uses OpenSSL API. Other implementations get mapped to it.    */
/* ----------------------------------------------------------------------- */

/*---------------*/
#if USE_OPENSSL_AES
/*---------------*/

#include <openssl/aes.h>                            /* http://openssl.org/ */

/*-----------------*/
#elif USE_OPENSSL_AES_NI
/*-----------------*/

#include <openssl/aes.h>                            /* http://openssl.org/ */
#include <wmmintrin.h>
#include "aesni-openssl.h"

/* Route the OpenSSL-style AES_* names used by the rest of this file to the
/  aesni_* routines (presumably declared in "aesni-openssl.h" -- confirm). */
#define AES_set_encrypt_key aesni_set_encrypt_key
#define AES_set_decrypt_key aesni_set_decrypt_key
#define AES_encrypt         aesni_encrypt
#define AES_decrypt         aesni_decrypt

/*-----------------*/
#elif USE_KASPER_AES
/*-----------------*/

/* Key context handed to the ECRYPT_* routines declared below.
/  NOTE(review): the layout is dictated by the external implementation that
/  defines those routines -- confirm it matches before changing it.        */
typedef struct { ALIGN(16) uint32_t bs_key[11][32]; uint32_t counter[4]; } AES_KEY;

/* The three ECRYPT_* functions are only declared here; their definitions
/  are not part of this file.                                              */
void ECRYPT_keysetup(
  AES_KEY* ctx, 
  const uint8_t* key, 
  uint32_t keysize,                /* Key size in bits. */ 
  uint32_t ivsize);                /* IV size in bits. */ 

void ECRYPT_ivsetup(
  AES_KEY* ctx, 
  const uint8_t* iv);

void ECRYPT_process_bytes(
  int action,                 /* 0 = encrypt; 1 = decrypt; */
  AES_KEY* ctx, 
  const uint8_t* input, 
  uint8_t* output, 
  uint32_t msglen);                /* Message length in bytes. */ 

/* OpenSSL-style key setup on top of ECRYPT_keysetup.
/  userKey: raw key bytes; bits: key size in bits; key: context to fill.
/  The IV size handed to ECRYPT_keysetup is always 128 bits.
/  Always returns 0.                                                       */
int AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
{
	enum { IV_BITS = 128 };

	ECRYPT_keysetup(key, userKey, bits, IV_BITS);
	return 0;
}

/*-----------------*/
#elif USE_VIA_ACE_AES
/*-----------------*/

/* Key context for the xcrypt* wrappers below: str holds the 16 raw key
/  bytes and cword the control word; both are passed by address to the
/  instruction.                                                            */
typedef struct { ALIGN(16) char str[16], cword[16]; } AES_KEY;

/* Issue one "xcryptctr" instruction.
/  Operands are bound through GCC x86 register constraints: in -> "S",
/  out -> "D", nblks -> "c", iv -> "a" (all read-write, so the instruction
/  may advance these local copies), key->cword -> "d", key->str -> "b".
/  The "memory" clobber tells the compiler that memory is read and written.
/  NOTE(review): presumably counter-mode encryption of nblks 16-byte blocks
/  -- confirm against the VIA documentation.                               */
static inline
void via_xcryptctr(void *in, void *out, void *iv, int nblks, const AES_KEY *key)
{
	__asm__ __volatile__("xcryptctr"
	        : "+S"(in), "+D"(out), "+c"(nblks), "+a"(iv)
	        : "d"(key->cword), "b"(key->str) : "memory");	
}
/* Issue one "xcryptcbc" instruction.
/  Same operand binding as via_xcryptctr: in -> "S", out -> "D",
/  nblks -> "c", iv -> "a" (read-write), key->cword -> "d", key->str -> "b",
/  with a "memory" clobber.
/  NOTE(review): presumably CBC-mode processing of nblks 16-byte blocks --
/  confirm against the VIA documentation.                                  */
static inline
void via_xcryptcbc(void *in, void *out, void *iv, int nblks, const AES_KEY *key)
{
	__asm__ __volatile__("xcryptcbc"
	        : "+S"(in), "+D"(out), "+c"(nblks), "+a"(iv)
	        : "d"(key->cword), "b"(key->str) : "memory");	
}
/* Issue one "xcryptecb" instruction; unlike the CTR and CBC wrappers it
/  takes no IV. Operand binding: in -> "S", out -> "D", nblks -> "c"
/  (read-write), key->cword -> "d", key->str -> "b", with a "memory"
/  clobber. Used by the AES_encrypt / AES_decrypt macros with nblks == 1.  */
static inline
void via_xcryptecb(void *in, void *out, int nblks, const AES_KEY *key)
{
	__asm__ __volatile__("xcryptecb"
	        : "+S"(in), "+D"(out), "+c"(nblks)
	        : "d"(key->cword), "b"(key->str) : "memory");	
}
/* Single-block encrypt and decrypt issue the same xcryptecb call; the only
/  difference visible in this file is the key's control word, where
/  AES_set_decrypt_key sets cword[1] to 2.                                 */
#define AES_encrypt(x,y,z)       via_xcryptecb(x,y,1,z)
#define AES_decrypt(x,y,z)       via_xcryptecb(x,y,1,z)

/* Fill a VIA ACE key context for encryption with a 128-bit key.
/  userKey: 16 raw key bytes; bits: key size in bits, must be 128;
/  key: context to fill.
/  Returns 0 on success. Returns -2 (the value OpenSSL's AES_set_encrypt_key
/  uses for an unsupported size) without touching *key when bits is not
/  128: str has room for exactly 16 bytes and the control word written
/  below selects 10 rounds, so a longer key would overrun str.             */
int AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key) {
	if (bits != 8 * (int)sizeof(key->str))
		return -2;
	__asm__ __volatile__ ("pushf\n\tpopf" : : : "cc"); /* Indicate new key */
	memcpy(key->str,userKey,sizeof(key->str));
	memset(key->cword,0,sizeof(key->cword));
	key->cword[0] = 10; /* Set ROUND bits to 10 */
	return 0;
}
/* Fill a VIA ACE key context for decryption: identical to the encryption
/  setup, followed by setting cword[1] to 2 (the CRYPT bit).
/  Returns whatever AES_set_encrypt_key returned.                          */
int AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key)
{
	const int status = AES_set_encrypt_key(userKey, bits, key);

	key->cword[1] = 2;
	return status;
}

/*-------------------*/
#elif USE_REFERENCE_AES
/*-------------------*/

#include "rijndael-alg-fst.h"              /* Barreto's Public-Domain Code */
/* Key schedule of MAX_KEY_BYTES+28 32-bit words (44 when MAX_KEY_BYTES is
/  16) plus the round count that the AES_set_*_key macros store.           */
typedef struct { uint32_t rd_key[MAX_KEY_BYTES+28]; int rounds; } AES_KEY;
/* OpenSSL-style names mapped onto the rijndael* reference routines.
/  x and y are (in, out) for AES_encrypt/AES_decrypt and (key bytes, key
/  size in bits) for the key-setup macros; z is an AES_KEY pointer.
/  The key-setup macros also record the round count, bits/32 + 6. Every
/  argument is parenthesized so an expression such as `a+b` passed as the
/  bit count still yields the right round count. Note that y is expanded
/  twice in the key-setup macros.                                          */
#define AES_encrypt(x,y,z)    rijndaelEncrypt((z)->rd_key, (z)->rounds, (x), (y))
#define AES_decrypt(x,y,z)    rijndaelDecrypt((z)->rd_key, (z)->rounds, (x), (y))
#define AES_set_encrypt_key(x, y, z) \
 do {rijndaelKeySetupEnc((z)->rd_key, (x), (y)); (z)->rounds = (y)/32+6;} while (0)
#define AES_set_decrypt_key(x, y, z) \
 do {rijndaelKeySetupDec((z)->rd_key, (x), (y)); (z)->rounds = (y)/32+6;} while (0)

#endif

/*----------*/
#if USE_AES_NI        /* It is acceptable that USE_OPENSSL_AES is true too */
/*----------*/

#include <wmmintrin.h>
#define AES_encrypt AES_encrypt_ni /* Avoid name conflict in openssl/aes.h */
#define AES_decrypt AES_decrypt_ni /* Avoid name conflict in openssl/aes.h */

#if USE_OPENSSL_AES       /* Use OpenSSL's key setup instead of intrinsics */

#define AES_ROUNDS(_key)  ((_key).rounds)

#else /* !USE_OPENSSL_AES -- Use intrinsics for key setup. AES-128 only    */

/* Key schedule for the intrinsic key setup: 7+MAX_KEY_BYTES/4 round keys
/  (11 when MAX_KEY_BYTES is 16).                                          */
typedef struct { __m128i rd_key[7+MAX_KEY_BYTES/4]; } AES_KEY;
/* The round count is a constant 10 and the argument is ignored, because
/  the intrinsic key setup in this branch handles AES-128 only.            */
#define AES_ROUNDS(_key)  (10)
/* One AES-128 key-expansion step.
/  a: previous round key; b: result of _mm_aeskeygenassist_si128 on a.
/  Each 32-bit word of the result is the XOR of that word of a with all
/  lower words of a, further XORed with the top 32-bit word of b.          */
static __m128i assist128(__m128i a, __m128i b)
{
    __m128i shifted = _mm_slli_si128(a, 0x04);       /* a moved up 1 word  */
    __m128i acc     = _mm_xor_si128(a, shifted);

    shifted = _mm_slli_si128(shifted, 0x04);         /* a moved up 2 words */
    acc     = _mm_xor_si128(acc, shifted);
    shifted = _mm_slli_si128(shifted, 0x04);         /* a moved up 3 words */
    acc     = _mm_xor_si128(acc, shifted);

    /* 0xff broadcasts the top 32-bit word of b to all four words. */
    return _mm_xor_si128(acc, _mm_shuffle_epi32(b, 0xff));
}
/* Expand a 128-bit key into the 11 round keys of key->rd_key.
/  userKey: 16 key bytes, read with an unaligned load; bits: ignored, the
/  expansion is AES-128 only; key: schedule to fill.                       */
static void AES_set_encrypt_key(const unsigned char *userKey,
                                const int bits, AES_KEY *key)
{
/* One expansion step. rcon has to be a literal because it is passed
/  straight through as the second argument of _mm_aeskeygenassist_si128.   */
#define NI_EXPAND_STEP(n, rcon) \
    rk[n] = assist128(rk[(n)-1], _mm_aeskeygenassist_si128(rk[(n)-1], rcon))

    __m128i *rk = key->rd_key;
    (void)bits; /* Suppress "unused" warning */
    rk[0] = _mm_loadu_si128((__m128i*)userKey);
    NI_EXPAND_STEP( 1, 0x1);
    NI_EXPAND_STEP( 2, 0x2);
    NI_EXPAND_STEP( 3, 0x4);
    NI_EXPAND_STEP( 4, 0x8);
    NI_EXPAND_STEP( 5, 0x10);
    NI_EXPAND_STEP( 6, 0x20);
    NI_EXPAND_STEP( 7, 0x40);
    NI_EXPAND_STEP( 8, 0x80);
    NI_EXPAND_STEP( 9, 0x1b);
    NI_EXPAND_STEP(10, 0x36);

#undef NI_EXPAND_STEP
}
/* Derive an 11-entry decryption schedule from an encryption schedule.
/  The first and last round keys swap places unchanged; the nine in between
/  are stored in reverse order after _mm_aesimc_si128.                     */
static void AES_NI_set_decrypt_key(__m128i *dkey, const __m128i *ekey)
{
    int src, dst;

    dkey[10] = ekey[0];
    for (src = 1, dst = 9; dst >= 1; ++src, --dst)
        dkey[dst] = _mm_aesimc_si128(ekey[src]);
    dkey[0] = ekey[10];
}

#endif  /* !USE_OPENSSL_AES */

/* Encrypt one 16-byte block with the AES-NI instructions.
/  in: 16 input bytes; out: 16 output bytes (may be the same buffer as in);
/  key: expanded encryption schedule with AES_ROUNDS(*key)+1 round keys.
/  in and out are accessed with unaligned loads/stores: they are plain byte
/  pointers, so nothing guarantees 16-byte alignment and the aligned forms
/  fault on a misaligned address.
/  NOTE(review): key->rd_key itself is still read as aligned __m128i --
/  confirm that holds when the schedule comes from OpenSSL.                */
static inline void AES_encrypt(const unsigned char *in,
                        unsigned char *out, const AES_KEY *key)
{
	int j;
	const __m128i *sched = ((const __m128i *)(key->rd_key));
	__m128i tmp = _mm_loadu_si128 ((const __m128i*)in);
	tmp = _mm_xor_si128 (tmp,sched[0]);
	for (j=1; j<AES_ROUNDS(*key); j++)  tmp = _mm_aesenc_si128 (tmp,sched[j]);
	/* j now equals the round count and indexes the final round key. */
	tmp = _mm_aesenclast_si128 (tmp,sched[j]);
	_mm_storeu_si128 ((__m128i*)out,tmp);
}
/* Decrypt one 16-byte block with the AES-NI instructions.
/  in: 16 input bytes; out: 16 output bytes (may be the same buffer as in);
/  key: expanded decryption schedule with AES_ROUNDS(*key)+1 round keys.
/  in and out are accessed with unaligned loads/stores: they are plain byte
/  pointers, so nothing guarantees 16-byte alignment and the aligned forms
/  fault on a misaligned address.
/  NOTE(review): key->rd_key itself is still read as aligned __m128i --
/  confirm that holds when the schedule comes from OpenSSL.                */
static inline void AES_decrypt(const unsigned char *in,
                        unsigned char *out, const AES_KEY *key)
{
	int j;
	const __m128i *sched = ((const __m128i *)(key->rd_key));
	__m128i tmp = _mm_loadu_si128 ((const __m128i*)in);
	tmp = _mm_xor_si128 (tmp,sched[0]);
	for (j=1; j<AES_ROUNDS(*key); j++)  tmp = _mm_aesdec_si128 (tmp,sched[j]);
	/* j now equals the round count and indexes the final round key. */
	tmp = _mm_aesdeclast_si128 (tmp,sched[j]);
	_mm_storeu_si128 ((__m128i*)out,tmp);
}

#endif

/* ----------------------------------------------------------------------- */

/* Per-key state behind the ae_ctx handle: only the expanded AES encryption
/  key. The type of AES_KEY depends on the AES back-end selected above.    */
struct _ae_ctx {
    AES_KEY encrypt_key;    /* filled by ae_init, used by ae_encrypt */
};

/* ----------------------------------------------------------------------- */

/* Set up ctx for encryption under the given key.
/  key/key_len: raw key and its length in bytes, passed on to the AES key
/  setup as key_len*8 bits. nonce_len and tag_len are not used.
/  Always returns AE_SUCCESS; the key setup's result is not checked.       */
int ae_init(ae_ctx *ctx, const void *key, int key_len, int nonce_len, int tag_len)
{
    unsigned char *key_bytes = (unsigned char *)key;

    (void)nonce_len;
    (void)tag_len;

    /* Only the encryption key schedule is needed here. */
    AES_set_encrypt_key(key_bytes, key_len*8, &ctx->encrypt_key);
    return AE_SUCCESS;
}

/* ----------------------------------------------------------------------- */
#if USE_VIA_ACE_AES
/* ----------------------------------------------------------------------- */

/* Encrypt pt into ct with a single via_xcryptctr call (VIA ACE back-end).
/  nonce: 16 bytes, copied to a local block that serves as the IV operand.
/  pt/pt_len: input; the length is rounded up to whole 16-byte blocks, so
/  both buffers must be valid up to that rounded size.
/  ad, ad_len, tag and final are not used.
/  Returns pt_len + 16.
/  NOTE(review): the other back-ends return pt_len itself -- confirm
/  whether the extra 16 is intended.                                       */
int ae_encrypt(ae_ctx     * restrict ctx,
               const void * restrict nonce,
               const void *pt,
               int         pt_len,
               const void *ad,
               int         ad_len,
               void       *ct,
               void       * restrict tag,
               int         final)
{
	block_multiview ctr;
	const int nblocks = (pt_len+15)/16;

	(void)ad;
	(void)ad_len;
	(void)tag;
	(void)final;

	ctr.bl = *(block *)nonce;
	via_xcryptctr((block *)pt, (block *)ct, ctr.u8, nblocks, &ctx->encrypt_key);
	return pt_len + 16;
}

/*-----------------*/
#elif USE_KASPER_AES
/*-----------------*/

/* Encrypt pt into ct through the external ECRYPT_* routines (Kasper
/  back-end).
/  nonce: when non-NULL it is installed with ECRYPT_ivsetup first; when
/  NULL the IV setup is skipped.
/  pt/pt_len: input; with SAFE_OUTPUT_BUFFERS the byte count handed to
/  ECRYPT_process_bytes is rounded up to a multiple of 128.
/  ad, ad_len, tag and final are not used.
/  Returns pt_len.                                                         */
int ae_encrypt(ae_ctx     *ctx,
               const void *nonce,
               const void *pt,
               int         pt_len,
               const void *ad,
               int         ad_len,
               void       *ct,
               void       *tag,
               int         final)
{
	uint32_t nbytes;

	(void)ad;
	(void)ad_len;
	(void)tag;
	(void)final;

	if (nonce != NULL)
		ECRYPT_ivsetup(&ctx->encrypt_key, nonce);

	#if SAFE_OUTPUT_BUFFERS
	nbytes = (pt_len+127)&~127;
	#else
	nbytes = pt_len;
	#endif

	ECRYPT_process_bytes(0, &ctx->encrypt_key, pt, ct, nbytes);
	return pt_len;
}

/* ----------------------------------------------------------------------- */
#elif USE_OPENSSL_AES_NI
/* ----------------------------------------------------------------------- */

#include "aesni-openssl.h"

/* CTR-mode encryption through OpenSSL's AES-NI routines.
/  nonce: 16-byte initial counter block. pt/pt_len: input; ct: output.
/  ad, ad_len, tag and final are not used. Returns pt_len.
/
/  With SAFE_OUTPUT_BUFFERS the length is rounded up to whole blocks and
/  everything goes through aesni_ctr32_encrypt_blocks. Otherwise the whole
/  blocks go through that routine and a trailing partial block is handled
/  here:
/   - the counter for that block is the nonce's last 32-bit word advanced
/     by the number of whole blocks. The ctr32 routine treats that word as
/     a big-endian counter, so the addition is done on the byte-swapped
/     value on little-endian hosts;
/   - only the `remaining` bytes that actually exist are read from pt and
/     written to ct; a full 16-byte block is never read past the end.      */
int ae_encrypt(ae_ctx     * restrict ctx,
               const void * restrict nonce,
               const void *pt,
               int         pt_len,
               const void *ad,
               int         ad_len,
               void       *ct,
               void       * restrict tag,
               int         final)
{
	#if SAFE_OUTPUT_BUFFERS
	aesni_ctr32_encrypt_blocks(pt,ct,(pt_len+15)/16,&ctx->encrypt_key,nonce);
	#else
    union { unsigned char u8[16]; uint32_t u32[4]; block bl; } ctr;
    unsigned remaining;
	aesni_ctr32_encrypt_blocks(pt,ct,pt_len/16,&ctx->encrypt_key,nonce);
	remaining = pt_len % 16;
	if (remaining) {
		const unsigned char *src = (const unsigned char *)pt + 16*(pt_len/16);
		unsigned char       *dst = (unsigned char *)ct + 16*(pt_len/16);
		unsigned k;

		memcpy(ctr.u8, nonce, sizeof(ctr.u8));
		ctr.u32[3] = bswap32_if_le(bswap32_if_le(ctr.u32[3])
		                           + (uint32_t)(pt_len/16));
		aesni_encrypt(ctr.u8,ctr.u8,&ctx->encrypt_key);
		for (k = 0; k < remaining; k++)
			dst[k] = (unsigned char)(src[k] ^ ctr.u8[k]);
	}
	#endif
	(void)ad;
	(void)ad_len;
	(void)tag;
	(void)final;
    return (int) pt_len;
}

/* ----------------------------------------------------------------------- */
#else
/* ----------------------------------------------------------------------- */

#if USE_AES_NI
#error -- Use aesni-x86 instead! This one is broken.
/* Encrypt four blocks t1..t4 in place with the schedule in
/  ctx->encrypt_key: XOR with round key 0, AES_ROUNDS-1 aesenc rounds, then
/  aesenclast with the final round key. The four blocks are interleaved
/  round by round. Expands to several statements (not wrapped in do/while)
/  and uses a loop variable `j` that the caller must declare.
/  This section is preceded by an #error directive.                        */
#define ENCRYPT_4_BLOCKS(t1,t2,t3,t4,ctx)                                  \
	t1 =_mm_xor_si128(t1, ((block*)(ctx->encrypt_key.rd_key))[0]);         \
	t2 =_mm_xor_si128(t2, ((block*)(ctx->encrypt_key.rd_key))[0]);         \
	t3 =_mm_xor_si128(t3, ((block*)(ctx->encrypt_key.rd_key))[0]);         \
	t4 =_mm_xor_si128(t4, ((block*)(ctx->encrypt_key.rd_key))[0]);         \
	for(j=1; j<AES_ROUNDS(ctx->encrypt_key); j+=1) {                       \
		t1 = _mm_aesenc_si128(t1, ((block*)(ctx->encrypt_key.rd_key))[j]); \
		t2 = _mm_aesenc_si128(t2, ((block*)(ctx->encrypt_key.rd_key))[j]); \
		t3 = _mm_aesenc_si128(t3, ((block*)(ctx->encrypt_key.rd_key))[j]); \
		t4 = _mm_aesenc_si128(t4, ((block*)(ctx->encrypt_key.rd_key))[j]); \
	}                                                                      \
	t1 =_mm_aesenclast_si128(t1, ((block*)(ctx->encrypt_key.rd_key))[j]);  \
	t2 =_mm_aesenclast_si128(t2, ((block*)(ctx->encrypt_key.rd_key))[j]);  \
	t3 =_mm_aesenclast_si128(t3, ((block*)(ctx->encrypt_key.rd_key))[j]);  \
	t4 =_mm_aesenclast_si128(t4, ((block*)(ctx->encrypt_key.rd_key))[j]);  \

/* CTR-style encryption with AES-NI, four blocks per iteration.
/  This variant sits behind an #error directive ("This one is broken"), so
/  it is never compiled as the file stands.
/  nonce: 16-byte starting block; it is advanced with add_one BEFORE each
/  use, so the first keystream block comes from add_one(nonce).
/  pt/pt_len: input; ct: output. ad, ad_len, tag and final are not used.
/  Returns pt_len.                                                         */
int ae_encrypt(ae_ctx     * restrict ctx,
               const void * restrict nonce,
               const void *pt,
               int         pt_len,
               const void *ad,
               int         ad_len,
               void       *ct,
               void       * restrict tag,
               int         final)
{
    unsigned iters, j, remaining;
	block i1,i2,i3,i4;
    block       * restrict ctp = (block *)ct;
    const block * restrict ptp = (block *)pt;

	/* pt_len += 16; */
	i4 = *(block *)nonce;
	
	/* Bulk: 64 bytes (four counter blocks) per iteration. */
	for (iters=pt_len/64; iters != 0; --iters) {
		i1 = add_one(i4);
		i2 = add_one(i1);
		i3 = add_one(i2);
		i4 = add_one(i3);
		ENCRYPT_4_BLOCKS(i1,i2,i3,i4,ctx);
		ctp[0] = xor_block(i1,ptp[0]);
		ctp[1] = xor_block(i2,ptp[1]);
		ctp[2] = xor_block(i3,ptp[2]);
		ctp[3] = xor_block(i4,ptp[3]);
		ptp += 4;
		ctp += 4;
	}
	remaining = pt_len % 64;
	/* Up to three further whole blocks, one at a time. i4 is encrypted in
	/  place here, so the next add_one is applied to keystream output.     */
	while (remaining >= 16) {
		i4 = add_one(i4);
		AES_encrypt((unsigned char *)&i4,(unsigned char *)&i4,&ctx->encrypt_key);
		ctp[0] = xor_block(i4,ptp[0]);
		ptp += 1;
		ctp += 1;
		remaining -= 16;
	}
	/* Trailing partial block: a full 16-byte block is read from ptp, but
	/  only `remaining` bytes are copied out.                              */
	if (remaining) {
		block in;
		i4 = add_one(i4);
		AES_encrypt((unsigned char *)&i4,(unsigned char *)&i4,&ctx->encrypt_key);
		in = xor_block(i4,*ptp);
		memcpy(ctp,&in,remaining);
	}
    return (int) pt_len;
}

#else

int ae_encrypt(ae_ctx     * restrict ctx,
               const void * restrict nonce,
               const void *pt,
               int         pt_len,
               const void *ad,
               int         ad_len,
               void       *ct,
               void       * restrict tag,
               int         final)
{
    unsigned iters, j, remaining;
	block i[4], o[4];
	uint32_t ctr;
    block       * restrict ctp = (block *)ct;
    const block * restrict ptp = (block *)pt;
    int rval = pt_len;

	#if SAFE_OUTPUT_BUFFERS
	pt_len = (pt_len+15) & ~15; /* Round up to nearest 16-byte value */
	#endif


	/* pt_len += 16; */
	i[0] = i[1] = i[2] = i[3] = *(block *)nonce;
	ctr = ((uint32_t *)nonce)[3];
	
	for (iters=pt_len/64; iters != 0; --iters) {
		((uint32_t *)i)[3] = ctr;
		((uint32_t *)(i+1))[3] = ctr+1;
		((uint32_t *)(i+2))[3] = ctr+2;
		((uint32_t *)(i+3))[3] = ctr+3;
		ctr += 4;
		AES_encrypt((unsigned char *)i, (unsigned char *)o, &ctx->encrypt_key);              \
		AES_encrypt((unsigned char *)(i+1), (unsigned char *)(o+1), &ctx->encrypt_key);              \
		AES_encrypt((unsigned char *)(i+2), (unsigned char *)(o+2), &ctx->encrypt_key);              \
		AES_encrypt((unsigned char *)(i+3), (unsigned char *)(o+3), &ctx->encrypt_key);              \
		ctp[0] = xor_block(o[0],ptp[0]);
		ctp[1] = xor_block(o[1],ptp[1]);
		ctp[2] = xor_block(o[2],ptp[2]);
		ctp[3] = xor_block(o[3],ptp[3]);
		ptp += 4;
		ctp += 4;
	}
	remaining = pt_len % 64;
	while (remaining >= 16) {
		((uint32_t *)i)[3] = ctr;
		ctr += 1;
		AES_encrypt((unsigned char *)i, (unsigned char *)o, &ctx->encrypt_key);              \
		ctp[0] = xor_block(o[0],ptp[0]);
		ptp += 1;
		ctp += 1;
		remaining -= 16;
	}
	#if (! SAFE_OUTPUT_BUFFERS)
	if (remaining) {
		((uint32_t *)i)[3] = ctr;
		AES_encrypt((unsigned char *)i, (unsigned char *)o, &ctx->encrypt_key);              \
		o[0] = xor_block(o[0],ptp[0]);
		memcpy(ctp,o,remaining);
	}
	#endif
    return (int) rval;
}

#endif

#endif


/* ----------------------------------------------------------------------- */
/* Public functions                                                        */
/* ----------------------------------------------------------------------- */

/* Some systems do not 16-byte-align dynamic allocations involving 16-byte
/  vectors. Adjust the following if your system is one of these            */

/* These determine how to allocate 16-byte aligned vectors, if needed.
/  USE_MM_MALLOC: SSE2 in use on a target that is neither _M_X64 nor
/  __amd64__, so ae_allocate uses _mm_malloc / ae_free uses _mm_free.
/  USE_POSIX_MEMALIGN: AltiVec in use with glibc on a non-PPC64 target, so
/  ae_allocate uses posix_memalign. Otherwise plain malloc is used.        */
#define USE_MM_MALLOC      (USE_SSE2 && !(_M_X64 || __amd64__))
#define USE_POSIX_MEMALIGN (USE_ALTIVEC && __GLIBC__ && !__PPC64__)

/* Allocate storage for one ae_ctx.
/  misc: unused in this implementation.
/  Uses _mm_malloc or posix_memalign with 16-byte alignment when the
/  corresponding USE_* macro is set, plain malloc otherwise.
/  Returns the new context, or NULL when allocation fails; release it with
/  ae_free.                                                                */
ae_ctx* ae_allocate(void *misc)
{
	void *mem;

	(void) misc;
	#if USE_MM_MALLOC
		mem = _mm_malloc(sizeof(ae_ctx), 16);
	#elif USE_POSIX_MEMALIGN
		if (posix_memalign(&mem, 16, sizeof(ae_ctx)) != 0) {
			mem = NULL;
		}
	#else
		mem = malloc(sizeof(ae_ctx));
	#endif
	return (ae_ctx *)mem;
}

/* Release a context obtained from ae_allocate, using _mm_free when the
/  allocation came from _mm_malloc and free otherwise.                     */
void ae_free(ae_ctx *ctx)
{
#if USE_MM_MALLOC
	_mm_free(ctx);
#else
	free(ctx);
#endif
}

/* Overwrite the whole context, key schedule included, with zero bytes so
/  that ae_init must be called again before further use.
/  Always returns AE_SUCCESS.                                              */
int ae_clear (ae_ctx *ctx)
{
	memset(ctx, 0, sizeof *ctx);
	return AE_SUCCESS;
}

/* Size of an ae_ctx in bytes, for callers that provide their own storage. */
int ae_ctx_sizeof(void)
{
	const size_t ctx_bytes = sizeof(ae_ctx);
	return (int) ctx_bytes;
}







/* Human-readable name of the configuration compiled in. The first USE_*
/  macro in this chain that is non-zero wins, so USE_AES_NI takes priority
/  over USE_OPENSSL_AES when both are set. USE_CRYPTOPP_AES is not given a
/  default in this file and counts as 0 unless the build defines it.       */
#if USE_OPENSSL_AES_NI
char infoString[] = "CTR (AES-NI OpenSSL)";
#elif USE_KASPER_AES
char infoString[] = "CTR (Kasper)";
#elif USE_VIA_ACE_AES
char infoString[] = "CTR (VIA ACE)";
#elif USE_AES_NI
char infoString[] = "CTR (AES-NI)";
#elif USE_CRYPTOPP_AES
char infoString[] = "CTR (Crypto++)";
#elif USE_REFERENCE_AES
char infoString[] = "CTR (Reference)";
#elif USE_OPENSSL_AES
char infoString[] = "CTR (OpenSSL)";
#else
char infoString[] = "CTR";
#endif
