/* AES back-end selection. If the build did not set any of the USE_* macros
/  below to a non-zero value, the defaults in this block are used. Exactly
/  one of them is set to 1 here (USE_OPENSSL_AES_NI).                      */
#if !(USE_OPENSSL_AES || USE_OPENSSL_AES_NI || USE_REFERENCE_AES ||           \
      USE_AES_NI || USE_VIA_ACE_AES || USE_KASPER_AES)
#define USE_OPENSSL_AES            0         /* http://openssl.org         */
#define USE_OPENSSL_AES_NI         1         /* http://openssl.org         */
#define USE_REFERENCE_AES          0         /* Google: rijndael-alg-fst.c */
#define USE_AES_NI                 0         /* Uses compiler's intrinsics */
#define USE_VIA_ACE_AES            0         /* VIA xcrypt* instructions   */
#define USE_KASPER_AES             0         /* External ECRYPT_* routines */
#endif

/* MAX_KEY_BYTES specifies the maximum size key you intend to supply to this
/  implementation, and *must* be 16, 24, or 32. In *some* AES implementations
/  it is possible to limit internal key-schedule sizes, so keep this as small
/  as possible.                                                            */
#define MAX_KEY_BYTES             16

/* To eliminate the use of vector types, set the following non-zero        */
#define VECTORS_OFF                0

/* ----------------------------------------------------------------------- */
/* Derived configuration options - Adjust as needed                        */
/* ----------------------------------------------------------------------- */

/* These determine whether vectors should be used. Both expand to an
/  expression meant for #if: true when the compiler advertises the vector
/  extension and VECTORS_OFF is zero.                                      */
#define USE_SSE2    ((__SSE2__ || (_M_IX86_FP>=2) || _M_X64) && !VECTORS_OFF)
#define USE_ALTIVEC (__ALTIVEC__ && !VECTORS_OFF)

/* ----------------------------------------------------------------------- */
/* Includes and compiler specific definitions                              */
/* ----------------------------------------------------------------------- */

#include "ae.h"
#include <stdlib.h>
#include <string.h>

/* Define standard sized integers. MSVC releases with _MSC_VER below 1600
/  get hand-written typedefs built on the __intN types; every other
/  compiler is expected to provide <stdint.h>.                             */
#if defined(_MSC_VER) && (_MSC_VER < 1600)
	typedef unsigned __int8  uint8_t;
	typedef unsigned __int32 uint32_t;
	typedef unsigned __int64 uint64_t;
	typedef          __int64 int64_t;
#else
	#include <stdint.h>
#endif

/* How to force specific alignment, request inline, restrict pointers.
/  ALIGN(n) is placed in front of a declaration. Note that `inline` and
/  `restrict` themselves are redefined as macros on the GNU and Microsoft
/  branches, and that every branch other than those two expands ALIGN(n)
/  to nothing, i.e. the alignment request is silently dropped there.       */
#if __GNUC__
	#define ALIGN(n) __attribute__ ((aligned(n)))
	#define inline __inline__
	#define restrict __restrict__
#elif _MSC_VER
	#define ALIGN(n) __declspec(align(n))
	#define inline __inline
	#define restrict __restrict
#elif __STDC_VERSION__ >= 199901L   /* C99: delete align, keep others      */
	#define ALIGN(n)
#else /* Not GNU/Microsoft/C99: delete alignment/inline/restrict uses.     */
	#define ALIGN(n)
	#define inline
	#define restrict
#endif

/* How to endian reverse a uint32_t.
/  bswap32 is defined exactly once, ahead of bswap64, so that the portable
/  bswap64 fallback can be built from it without the macro ever being
/  defined a second time with a different replacement list.
/  The GNU test is parenthesised the way the preprocessor already parsed
/  it: any GCC major version above 4, or GCC 4.3+ when not targeting ARM.  */
#if _MSC_VER
    /* The 32-bit CRT byte swap is _byteswap_ulong (declared in <stdlib.h>). */
    #define bswap32(x) ((uint32_t)_byteswap_ulong((unsigned long)(x)))
#elif (__GNUC__ > 4) || (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 3)) && !__arm__)
    #define bswap32(x) ((uint32_t)__builtin_bswap32((uint32_t)(x)))
#elif __GNUC__ && (__ARM_ARCH_6__ || __ARM_ARCH_6J__ || __ARM_ARCH_6K__ ||    \
    __ARM_ARCH_6Z__ || __ARM_ARCH_6ZK__ || __ARM_ARCH_6T2__ ||              \
    __ARM_ARCH_7__ || __ARM_ARCH_7A__ || __ARM_ARCH_7R__ || __ARM_ARCH_7M__)
	#define bswap32(x) ({uint32_t y; __asm__("rev %0, %1":"=r"(y):"r"(x));y;})
#elif __GNUC__ && __arm__
	#define bswap32(x)                             \
		({uint32_t t,y;                            \
		__asm__("eor     %1, %2, %2, ror #16\n\t" \
				"bic     %1, %1, #0x00FF0000\n\t" \
				"mov     %0, %2, ror #8\n\t"      \
				"eor     %0, %0, %1, lsr #8"      \
				: "=r"(y), "=&r"(t) : "r"(x));y;})
#elif __GNUC__ && __i386__
	/* 32-bit temporary: the bswap instruction operates on one 32-bit register */
	#define bswap32(x) ({uint32_t y=(x);__asm__("bswap %0":"+r"(y));y;})
#else        /* Some compilers recognize the following pattern */
	#define bswap32(x)                         \
	   ((((x) & UINT32_C(0xff000000)) >> 24) | \
		(((x) & UINT32_C(0x00ff0000)) >>  8) | \
		(((x) & UINT32_C(0x0000ff00)) <<  8) | \
		(((x) & UINT32_C(0x000000ff)) << 24))
#endif

/* How to endian reverse a uint64_t                                        */
#if _MSC_VER
    #define bswap64(x) _byteswap_uint64(x)
#elif (__GNUC__ > 4) || (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 3)) && !__arm__)
    #define bswap64(x) __builtin_bswap64(x)
#elif __GNUC__ && __amd64__
    #define bswap64(x) ({uint64_t y=x;__asm__("bswapq %0":"+r"(y));y;})
#else

/* Build bswap64 out of two bswap32's: swap the two 32-bit halves and
/  byte-reverse each of them. The union indices make this independent of
/  which half is stored first.                                             */
static inline uint64_t bswap64(uint64_t x) {
	union { uint64_t ll; uint32_t l[2]; } w, r;
	w.ll = x;
	r.l[0] = bswap32(w.l[1]);
	r.l[1] = bswap32(w.l[0]);
	return r.ll;
}

#endif

/* Return x byte-reversed when running on a little-endian host and x
/  unchanged otherwise. Endianness is probed at run time by looking at the
/  first byte of an unsigned that was initialised to 1.                    */
static inline uint32_t bswap32_if_le(uint32_t x)
{
	const union { unsigned word; unsigned char first_byte; } probe = { 1 };

	if (probe.first_byte)
		return bswap32(x);
	return x;
}

/* ----------------------------------------------------------------------- */
/* Define blocks and operations -- Patch if incorrect on your compiler.    */
/* ----------------------------------------------------------------------- */

/* Vector definitions of the 128-bit `block` type and its helpers.
/  Each branch supplies xor_block, zero_block, unequal_blocks and
/  swap_if_le; the SSE2 branch additionally supplies add_one.              */
#if USE_SSE2
    #include <xmmintrin.h>        /* SSE instructions and _mm_malloc */
    #include <emmintrin.h>        /* SSE2 instructions               */
    typedef ALIGN(16) __m128i block;
    /* Adds 1 to the most significant 32-bit lane of b */
    #define add_one(b)            _mm_add_epi32(b,_mm_set_epi32(1,0,0,0))
    #define xor_block(x, y)       _mm_xor_si128(x,y)
    #define zero_block()          _mm_setzero_si128()
    /* Byte-wise compare; the 16-bit mask is 0xffff only if all bytes match */
    #define unequal_blocks(x, y) \
    					   (_mm_movemask_epi8(_mm_cmpeq_epi8(x,y)) != 0xffff)
	#if __SSSE3__
    #include <tmmintrin.h>        /* SSSE3 instructions              */
    /* Reverse all 16 bytes with a single byte shuffle */
    #define swap_if_le(b) \
      _mm_shuffle_epi8(b,_mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15))
	#else
    /* SSE2-only byte reversal: reverse the 32-bit lanes, swap the 16-bit
    /  words inside each 32-bit lane, then swap the two bytes of each word. */
    static inline block swap_if_le(block b) {
		block a = _mm_shuffle_epi32  (b, _MM_SHUFFLE(0,1,2,3));
		a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(2,3,0,1));
		a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(2,3,0,1));
		return _mm_xor_si128(_mm_srli_epi16(a,8), _mm_slli_epi16(a,8));
    }
	#endif
#elif USE_ALTIVEC
    #include <altivec.h>
    typedef ALIGN(16) vector unsigned block;
    #define xor_block(x,y)        vec_xor(x,y)
    #define zero_block()          vec_splat_u32(0)
    #define unequal_blocks(x,y)   vec_any_ne(x,y)
    /* No byte swap is performed on this branch */
    #define swap_if_le(b)         (b)
#else
    /* Portable fallback: a block is a pair of 64-bit halves, l and r.     */
    typedef struct {
        uint64_t l;
        uint64_t r;
    } block;

    /* Return the bitwise XOR of two blocks.                               */
    static block xor_block(block x, block y)
    {
        block out;
        out.l = x.l ^ y.l;
        out.r = x.r ^ y.r;
        return out;
    }

    /* Return a block whose halves are both zero.                          */
    static block zero_block(void)
    {
        block z;
        z.l = 0;
        z.r = 0;
        return z;
    }

    /* Non-zero when x and y differ in at least one bit.                   */
    #define unequal_blocks(x, y) \
        (0 != (((x).l ^ (y).l) | ((x).r ^ (y).r)))
    /* Byte-reverse each 64-bit half of b on a little-endian host; return b
    /  untouched otherwise. Endianness is detected at run time from the
    /  first byte of an unsigned initialised to 1.                         */
    static inline block swap_if_le(block b) {
        const union { unsigned word; unsigned char first_byte; } probe = { 1 };

        if (!probe.first_byte)
            return b;
        b.l = bswap64(b.l);
        b.r = bswap64(b.r);
        return b;
    }
#endif

/* View of one block as 64-bit words, 32-bit words, bytes, or the native
/  `block` type. Reading a member other than the one last written relies on
/  union type punning, which is well supported in compilers.               */
typedef union {
	uint64_t u64[2];
	uint32_t u32[4];
	uint8_t  u8[16];
	block    bl;
} block_multiview;

/* ----------------------------------------------------------------------- */
/* AES - Code uses OpenSSL API. Other implementations get mapped to it.    */
/* ----------------------------------------------------------------------- */

/*---------------*/
#if USE_OPENSSL_AES
/*---------------*/

#include <openssl/aes.h>                            /* http://openssl.org/ */

/*-----------------*/
#elif USE_OPENSSL_AES_NI
/*-----------------*/

#include <openssl/aes.h>                            /* http://openssl.org/ */
#include <wmmintrin.h>
#include "aesni-openssl.h"

/* Route the OpenSSL-style AES names used by the rest of this file to the
/  aesni_* routines (expected to be declared by "aesni-openssl.h").        */
#define AES_set_encrypt_key aesni_set_encrypt_key
#define AES_set_decrypt_key aesni_set_decrypt_key
#define AES_encrypt         aesni_encrypt
#define AES_decrypt         aesni_decrypt

/*-----------------*/
#elif USE_KASPER_AES
/*-----------------*/

/* Disabled: an OpenSSL-style AES_KEY / AES_set_encrypt_key shim over the
/  ECRYPT_* interface. Nothing inside this `#if 0` region is compiled; the
/  active Kasper code path declares its own context and prototypes.        */
#if 0
typedef struct { ALIGN(16) uint32_t bs_key[11][32]; uint32_t counter[4]; } AES_KEY;

void ECRYPT_keysetup(
  AES_KEY* ctx, 
  const uint8_t* key, 
  uint32_t keysize,                /* Key size in bits. */ 
  uint32_t ivsize);                /* IV size in bits. */ 

void ECRYPT_ivsetup(
  AES_KEY* ctx, 
  const uint8_t* iv);

void ECRYPT_process_bytes(
  int action,                 /* 0 = encrypt; 1 = decrypt; */
  AES_KEY* ctx, 
  const uint8_t* input, 
  uint8_t* output, 
  uint32_t msglen);                /* Message length in bytes. */ 

int AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key) {
	ECRYPT_keysetup(key, userKey, bits, 128);
	return 0;
}
#endif

/*-----------------*/
#elif USE_VIA_ACE_AES
/*-----------------*/

/* Key material for the VIA ACE path: str holds the 16 key bytes and cword
/  the 16-byte control word handed to the xcrypt* instructions.            */
typedef struct { ALIGN(16) char str[16], cword[16]; } AES_KEY;

/* Issue the xcryptctr instruction. Operand binding (GCC x86 constraints):
/  S=in, D=out, c=nblks, a=iv are read and updated; d=control word and
/  b=key bytes are inputs. "memory" tells the compiler memory is touched.  */
static inline
void via_xcryptctr(void *in, void *out, void *iv, int nblks, const AES_KEY *key)
{
	__asm__ __volatile__("xcryptctr"
	        : "+S"(in), "+D"(out), "+c"(nblks), "+a"(iv)
	        : "d"(key->cword), "b"(key->str) : "memory");	
}
/* Issue the xcryptcbc instruction; same operand binding as via_xcryptctr. */
static inline
void via_xcryptcbc(void *in, void *out, void *iv, int nblks, const AES_KEY *key)
{
	__asm__ __volatile__("xcryptcbc"
	        : "+S"(in), "+D"(out), "+c"(nblks), "+a"(iv)
	        : "d"(key->cword), "b"(key->str) : "memory");	
}
/* Issue the xcryptecb instruction on nblks blocks; no IV operand.         */
static inline
void via_xcryptecb(void *in, void *out, int nblks, const AES_KEY *key)
{
	__asm__ __volatile__("xcryptecb"
	        : "+S"(in), "+D"(out), "+c"(nblks)
	        : "d"(key->cword), "b"(key->str) : "memory");	
}
/* Single-block OpenSSL-style entry points. Both map to the same ECB call;
/  the direction is taken from the control word stored in the key.        */
#define AES_encrypt(x,y,z)       via_xcryptecb(x,y,1,z)
#define AES_decrypt(x,y,z)       via_xcryptecb(x,y,1,z)

/* Load an AES-128 key for the VIA ACE engine.
/    userKey  16 key bytes
/    bits     key length in bits; only 128 is accepted
/    key      destination key structure
/  Returns 0 on success, -1 when a pointer is NULL and -2 when bits is not
/  128. key->str is exactly 16 bytes and the ROUND field below is fixed at
/  10, so a longer key would overrun str and be silently truncated; it is
/  rejected instead.                                                       */
int AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key) {
	if (userKey == NULL || key == NULL)
		return -1;
	if (bits != 128)
		return -2;
	__asm__ __volatile__ ("pushf\n\tpopf" : : : "cc"); /* Indicate new key */
	memcpy(key->str, userKey, sizeof(key->str));
	memset(key->cword, 0, sizeof(key->cword));
	key->cword[0] = 10; /* Set ROUND bits to 10 */
	return 0;
}
/* Prepare a key for decryption: perform the encryption key setup, then
/  flag the control word for decryption. The setup status is returned
/  unchanged.                                                              */
int AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key) {
	const int status = AES_set_encrypt_key(userKey, bits, key);

	key->cword[1] = 2; /* Set CRYPT bit for decryption */
	return status;
}

/*-------------------*/
#elif USE_REFERENCE_AES
/*-------------------*/

#include "rijndael-alg-fst.h"              /* Barreto's Public-Domain Code */
/* OpenSSL-style wrappers around the rijndael-alg-fst reference code.
/  rd_key holds the expanded key, rounds the round count (bits/32 + 6).
/  Every macro argument is parenthesised so that expression arguments such
/  as `len*8` or `a+b` expand with the intended precedence. The two key
/  setup macros are statements, not expressions: they yield no value.      */
typedef struct { uint32_t rd_key[MAX_KEY_BYTES+28]; int rounds; } AES_KEY;
#define AES_encrypt(x,y,z)    rijndaelEncrypt((z)->rd_key, (z)->rounds, (x), (y))
#define AES_decrypt(x,y,z)    rijndaelDecrypt((z)->rd_key, (z)->rounds, (x), (y))
#define AES_set_encrypt_key(x, y, z) \
 do {rijndaelKeySetupEnc((z)->rd_key, (x), (y)); (z)->rounds = (y)/32+6;} while (0)
#define AES_set_decrypt_key(x, y, z) \
 do {rijndaelKeySetupDec((z)->rd_key, (x), (y)); (z)->rounds = (y)/32+6;} while (0)

#endif

/*----------*/
#if USE_AES_NI        /* It is acceptable that USE_OPENSSL_AES is true too */
/*----------*/

#include <wmmintrin.h>
#include <smmintrin.h>
/* The single-block routines defined further down are emitted under the
/  names AES_encrypt_ni / AES_decrypt_ni.                                  */
#define AES_encrypt AES_encrypt_ni /* Avoid name conflict in openssl/aes.h */
#define AES_decrypt AES_decrypt_ni /* Avoid name conflict in openssl/aes.h */

#if USE_OPENSSL_AES       /* Use OpenSSL's key setup instead of intrinsics */

/* Round count is read from the `rounds` member of the key structure */
#define AES_ROUNDS(_key)  ((_key).rounds)

#else /* !USE_OPENSSL_AES -- Use intrinsics for key setup. AES-128 only    */

/* 7 + MAX_KEY_BYTES/4 round keys: 11 entries when MAX_KEY_BYTES is 16 */
typedef struct { __m128i rd_key[7+MAX_KEY_BYTES/4]; } AES_KEY;
/* Fixed round count; the argument is ignored */
#define AES_ROUNDS(_key)  (10)
/* Key-expansion helper: XOR a with itself shifted left by 4, 8 and 12
/  bytes, then XOR in the top 32-bit lane of b broadcast to all lanes.     */
static __m128i assist128(__m128i a, __m128i b)
{
    const __m128i shl4   = _mm_slli_si128(a, 4);
    const __m128i shl8   = _mm_slli_si128(shl4, 4);
    const __m128i shl12  = _mm_slli_si128(shl8, 4);
    const __m128i folded = _mm_xor_si128(_mm_xor_si128(a, shl4),
                                         _mm_xor_si128(shl8, shl12));
    const __m128i spread = _mm_shuffle_epi32(b, 0xff);

    return _mm_xor_si128(folded, spread);
}
/* Expand a 16-byte user key into the 11 round keys of key->rd_key using
/  the AES-NI key generation assist instruction. `bits` is ignored: this
/  path handles 128-bit keys only.                                         */
static void AES_set_encrypt_key(const unsigned char *userKey,
                                const int bits, AES_KEY *key)
{
    __m128i *rk = key->rd_key;

/* The round constant is passed as a literal to the intrinsic, so the ten
/  steps are spelled out through a macro instead of a run-time loop.       */
#define NEXT_ROUND_KEY(n, rcon) \
    rk[(n)] = assist128(rk[(n)-1], _mm_aeskeygenassist_si128(rk[(n)-1], rcon))

    (void)bits; /* Suppress "unused" warning */
    rk[0] = _mm_loadu_si128((__m128i*)userKey);
    NEXT_ROUND_KEY( 1, 0x1);
    NEXT_ROUND_KEY( 2, 0x2);
    NEXT_ROUND_KEY( 3, 0x4);
    NEXT_ROUND_KEY( 4, 0x8);
    NEXT_ROUND_KEY( 5, 0x10);
    NEXT_ROUND_KEY( 6, 0x20);
    NEXT_ROUND_KEY( 7, 0x40);
    NEXT_ROUND_KEY( 8, 0x80);
    NEXT_ROUND_KEY( 9, 0x1b);
    NEXT_ROUND_KEY(10, 0x36);

#undef NEXT_ROUND_KEY
}
/* Derive the 11-entry decryption schedule dkey from the encryption
/  schedule ekey: the order is reversed and the nine inner round keys are
/  passed through the InvMixColumns intrinsic.                             */
static void AES_NI_set_decrypt_key(__m128i *dkey, const __m128i *ekey)
{
    int round = 1;

    dkey[10] = ekey[0];
    while (round <= 9) {
        dkey[10 - round] = _mm_aesimc_si128(ekey[round]);
        round++;
    }
    dkey[0] = ekey[10];
}

#endif  /* !USE_OPENSSL_AES */

static inline void AES_encrypt(const unsigned char *in,
                        unsigned char *out, const AES_KEY *key)
{
	int j;
	const __m128i *sched = ((__m128i *)(key->rd_key));
	__m128i tmp = _mm_load_si128 ((__m128i*)in);
	tmp = _mm_xor_si128 (tmp,sched[0]);
	for (j=1; j<AES_ROUNDS(*key); j++)  tmp = _mm_aesenc_si128 (tmp,sched[j]);
	tmp = _mm_aesenclast_si128 (tmp,sched[j]);
	_mm_store_si128 ((__m128i*)out,tmp);
}
static inline void AES_decrypt(const unsigned char *in,
                        unsigned char *out, const AES_KEY *key)
{
	int j;
	const __m128i *sched = ((__m128i *)(key->rd_key));
	__m128i tmp = _mm_load_si128 ((__m128i*)in);
	tmp = _mm_xor_si128 (tmp,sched[0]);
	for (j=1; j<AES_ROUNDS(*key); j++)  tmp = _mm_aesdec_si128 (tmp,sched[j]);
	tmp = _mm_aesdeclast_si128 (tmp,sched[j]);
	_mm_store_si128 ((__m128i*)out,tmp);
}

#endif


/* ----------------------------------------------------------------------- */
#if USE_KASPER_AES
/* ----------------------------------------------------------------------- */

/* Context for the Kasper back end. It is handed directly to the external
/  ECRYPT_keysetup/process_bytes/authenticate/finalmul/tablesetup routines,
/  so the member order and sizes presumably have to match what those
/  routines expect -- TODO confirm against their source before changing.  */
struct _ae_ctx {
    ALIGN(16) uint32_t bs_key[11][32]; /* key material filled by the external key setup (assumed) */
    uint32_t counter[4];               /* 16-byte counter block: 12 nonce bytes + 4 counter bytes */
    uint32_t authtag[4];               /* 16 bytes, zeroed when a new nonce is supplied */
    uint64_t totallen;                 /* reset to 0 when a new nonce is supplied */
    uint64_t padding;
    #if !NO_TABLE
    uint32_t gfmtable[2048];           /* omitted when NO_TABLE is non-zero */
    #endif
};

/* Routines supplied by the external Kasper implementation. Their exact
/  semantics are not visible in this file; the notes below only record how
/  this file calls them.                                                   */
/* Called once from ae_init with keysize 16 and ivsize 12 */
extern void ECRYPT_keysetup(ae_ctx *ctx, const uint8_t *key, const uint32_t keysize, const uint32_t ivsize);
/* Called with action 0; output is written despite the const qualifier */
extern void process_bytes(int action, const ae_ctx *ctx, const uint8_t *input, const uint8_t *output, uint32_t len);
/* Produces the 16-byte tag in mac from the context and the block ey0 */
extern void finalmul(ae_ctx *ctx, uint8_t *mac, uint8_t *ey0);
/* Called once from ae_init with the 16-byte block h */
extern void tablesetup(ae_ctx *ctx, const uint8_t *h);
/* Called with the ciphertext after each process_bytes call */
extern void authenticate(ae_ctx *ctx, const uint8_t *cipher, uint32_t len);

/* Initialise a Kasper context from a key.
/  The counter block is cleared, the key is installed, a zero block is run
/  through process_bytes and the result is handed to tablesetup.
/  key_len, nonce_len and tag_len are not consulted: the key setup is
/  always called with 16 and 12. Always returns AE_SUCCESS.                */
int ae_init(ae_ctx     *ctx,
            const void *key,
            int         key_len,
            int         nonce_len,
            int         tag_len)
{
    ALIGN(16) unsigned char h_block[16];

    (void)key_len;
    (void)nonce_len;
    (void)tag_len;

    memset(h_block, 0, sizeof h_block);
    memset(ctx->counter, 0, sizeof ctx->counter);

    ECRYPT_keysetup(ctx, key, 16, 12);
    process_bytes(0, ctx, h_block, h_block, 16);
    tablesetup(ctx, h_block);
    return AE_SUCCESS;
}

/* Encrypt pt_len bytes of pt into ct and produce a 16-byte tag.
/    nonce  when non-NULL, 12 bytes that start a new message: the counter
/           block becomes nonce || 00 00 00 02 and the running tag state
/           and total length are cleared
/    tag    when non-NULL receives the tag and pt_len is returned; when
/           NULL the tag is appended to ct and pt_len + 16 is returned
/  NOTE(review): ad, ad_len and final are never read in this variant, so
/  associated data does not appear to be authenticated -- confirm intent.  */
int ae_encrypt(ae_ctx     *ctx,
               const void *nonce,
               const void *pt,
               int         pt_len,
               const void *ad,
               int         ad_len,
               void       *ct,
               void       *tag,
               int         final)
{
    ALIGN(16) unsigned char ey0[16] = {0};
    unsigned char *ctr = (unsigned char *)&ctx->counter;

    (void)ad;
    (void)ad_len;
    (void)final;

    if (nonce) {
        memcpy(ctr, nonce, 12);
        ctr[12] = 0;
        ctr[13] = 0;
        ctr[14] = 0;
        ctr[15] = 2;
        memset(&ctx->authtag, 0, 16);
        ctx->totallen = 0;
    }

    process_bytes(0, ctx, pt, ct, pt_len);
    authenticate(ctx, ct, pt_len);

    /* Rewind the counter field to 1 and process a zero block into ey0 */
    ctr[12] = 0;
    ctr[13] = 0;
    ctr[14] = 0;
    ctr[15] = 1;
    process_bytes(0, ctx, ey0, ey0, 16);

    if (!tag) {
        unsigned char tmp[16];
        finalmul(ctx, tmp, ey0);
        memcpy((char *)ct + pt_len, tmp, 16);
        return pt_len + 16;
    }
    finalmul(ctx, tag, ey0);
    return pt_len;
}

/* ----------------------------------------------------------------------- */
#elif USE_OPENSSL_AES || USE_OPENSSL_AES_NI
/* ----------------------------------------------------------------------- */

#include <openssl/modes.h>

/* Size reserved for OpenSSL's GCM128_CONTEXT, which this file only sees as
/  an opaque type. The figure is hard-coded from the field breakdown below
/  and is only valid where that breakdown holds:
/  6 * 16 + 16 * 16 + 4 * size_t + 2 * int = 392 (on LP64) */
#define SIZEOF_GCM128_CONTEXT  (392)

/* `allocation` must stay the first member: the code casts an ae_ctx
/  pointer directly to GCM128_CONTEXT *. aes_key is the AES schedule that
/  the GCM context is initialised with.                                    */
struct _ae_ctx {
    ALIGN(16) char allocation[SIZEOF_GCM128_CONTEXT];
    ALIGN(16) AES_KEY aes_key;
};


/* Initialise an OpenSSL-backed GCM context.
/  Expands `key` (key_len bytes) into ctx->aes_key and binds the GCM state
/  at the start of ctx to that schedule with AES_encrypt as block function.
/  nonce_len and tag_len are not consulted. Always returns AE_SUCCESS.     */
int ae_init(ae_ctx     *ctx,
            const void *key,
            int         key_len,
            int         nonce_len,
            int         tag_len)
{
	GCM128_CONTEXT *gcm = (GCM128_CONTEXT *)ctx;

	(void)nonce_len;
	(void)tag_len;

	AES_set_encrypt_key(key, key_len*8, &ctx->aes_key);
	CRYPTO_gcm128_init(gcm, &ctx->aes_key, (block128_f)AES_encrypt);
	return AE_SUCCESS;
}

/* Encrypt and authenticate one message with OpenSSL's GCM128 routines.
/    nonce   12-byte nonce
/    pt/ct   plaintext in, ciphertext out, pt_len bytes
/    ad      ad_len bytes of associated data (skipped when ad_len <= 0)
/    tag     when non-NULL receives the 16-byte tag and pt_len is returned;
/            when NULL the tag is appended to ct and pt_len + 16 returned
/    final   not consulted
/  The tag is fetched with CRYPTO_gcm128_tag(). Copying the first 16 bytes
/  of the context, as was done previously, yields the counter block that
/  sits at the start of the GCM state rather than the authentication tag.  */
int ae_encrypt(ae_ctx     *ctx,
               const void *nonce,
               const void *pt,
               int         pt_len,
               const void *ad,
               int         ad_len,
               void       *ct,
               void       *tag,
               int         final)
{
	GCM128_CONTEXT *gcm = (GCM128_CONTEXT *)ctx;
	unsigned char *tag_out;

	(void)final;

	CRYPTO_gcm128_setiv(gcm, nonce, 12);
	if (ad_len > 0)
		CRYPTO_gcm128_aad(gcm, ad, ad_len);
	if (pt_len > 0) {
		#if USE_OPENSSL_AES_NI
		CRYPTO_gcm128_encrypt_ctr32(gcm, pt, ct, pt_len, (ctr128_f)aesni_ctr32_encrypt_blocks);
		#else
		CRYPTO_gcm128_encrypt(gcm, pt, ct, pt_len);
		#endif
	}

	/* Finalise and write the tag either to the caller's buffer or right
	/  behind the ciphertext.                                             */
	tag_out = tag ? (unsigned char *)tag : (unsigned char *)ct + pt_len;
	CRYPTO_gcm128_tag(gcm, tag_out, 16);

	return tag ? pt_len : pt_len + 16;
}

/* ----------------------------------------------------------------------- */
#elif USE_AES_NI
/* ----------------------------------------------------------------------- */



/* Context for the AES-NI intrinsics back end: 16-byte aligned storage for
/  up to 15 round keys of 16 bytes each. ae_init fills it through an
/  AES_KEY pointer cast.                                                   */
struct _ae_ctx {
	ALIGN(16) uint8_t KEY[15 * 16];
};

/* Initialise an AES-NI context by expanding `key` (key_len bytes) into
/  ctx->KEY. nonce_len and tag_len are not consulted. Always returns
/  AE_SUCCESS.                                                             */
int ae_init(ae_ctx     *ctx,
            const void *key,
            int         key_len,
            int         nonce_len,
            int         tag_len)
{
	AES_KEY *schedule = (AES_KEY *)ctx->KEY;

	(void)nonce_len;
	(void)tag_len;

	AES_set_encrypt_key(key, key_len*8, schedule);
	return AE_SUCCESS;
}

/* ----------------------------------------------------------------------- */

/* Compute H1*X1 ^ H2*X2 ^ H3*X3 ^ H4*X4 with carry-less multiplies and
/  perform a single shift-and-reduce on the accumulated 256-bit value,
/  storing the 128-bit result in *res. The shift counts used in the
/  reduction (31/30/25 and 1/2/7) match the x^128 + x^7 + x^2 + x + 1
/  polynomial of GCM. Operands are expected in the byte-reflected form the
/  caller produces with BSWAP_MASK.                                        */
static void reduce4 (__m128i H1,__m128i H2,__m128i H3,__m128i H4,
              __m128i X1,__m128i X2,__m128i X3, __m128i X4, __m128i *res)
{
	/*algorithm by Krzysztof Jankowski, Pierre Laurent - Intel*/
	__m128i H1_X1_lo, H1_X1_hi,
	        H2_X2_lo, H2_X2_hi, 
	        H3_X3_lo, H3_X3_hi, 
	        H4_X4_lo, H4_X4_hi, 
	        lo, hi;
	__m128i tmp0, tmp1, tmp2, tmp3; 
	__m128i tmp4, tmp5, tmp6, tmp7; 
	__m128i tmp8, tmp9;
	
	/* Products of the low 64-bit halves of each pair */
	H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00); 
	H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00); 
	H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00); 
	H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);
	
	lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo); 
	lo = _mm_xor_si128(lo, H3_X3_lo); 
	lo = _mm_xor_si128(lo, H4_X4_lo);
	
	/* Products of the high 64-bit halves of each pair */
	H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11); 
	H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11); 
	H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11); 
	H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);
	
	hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi); 
	hi = _mm_xor_si128(hi, H3_X3_hi); 
	hi = _mm_xor_si128(hi, H4_X4_hi);
	
	/* Karatsuba middle terms: (hi ^ lo half) of each operand; shuffle
	   constant 78 swaps the two 64-bit halves */
	tmp0 = _mm_shuffle_epi32(H1, 78); 
	tmp4 = _mm_shuffle_epi32(X1, 78); 
	tmp0 = _mm_xor_si128(tmp0, H1); 
	tmp4 = _mm_xor_si128(tmp4, X1); 
	tmp1 = _mm_shuffle_epi32(H2, 78); 
	tmp5 = _mm_shuffle_epi32(X2, 78); 
	tmp1 = _mm_xor_si128(tmp1, H2); 
	tmp5 = _mm_xor_si128(tmp5, X2); 
	tmp2 = _mm_shuffle_epi32(H3, 78); 
	tmp6 = _mm_shuffle_epi32(X3, 78); 
	tmp2 = _mm_xor_si128(tmp2, H3); 
	tmp6 = _mm_xor_si128(tmp6, X3); 
	tmp3 = _mm_shuffle_epi32(H4, 78); 
	tmp7 = _mm_shuffle_epi32(X4, 78); 
	tmp3 = _mm_xor_si128(tmp3, H4); 
	tmp7 = _mm_xor_si128(tmp7, X4);
	
	tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00); 
	tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
	tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00); 
	tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);
	
	/* Middle term = sum of the four products ^ lo ^ hi */
	tmp0 = _mm_xor_si128(tmp0, lo); 
	tmp0 = _mm_xor_si128(tmp0, hi); 
	tmp0 = _mm_xor_si128(tmp1, tmp0); 
	tmp0 = _mm_xor_si128(tmp2, tmp0); 
	tmp0 = _mm_xor_si128(tmp3, tmp0);
	
	/* Fold the middle term into the 256-bit value hi:lo */
	tmp4 = _mm_slli_si128(tmp0, 8); 
	tmp0 = _mm_srli_si128(tmp0, 8);
	
	lo = _mm_xor_si128(tmp4, lo); 
	hi = _mm_xor_si128(tmp0, hi);
	
	tmp3 = lo; 
	tmp6 = hi;
	
	/* Shift the 256-bit value left by one bit, carrying between the
	   32-bit lanes and from the low into the high 128 bits */
	tmp7 = _mm_srli_epi32(tmp3, 31); 
	tmp8 = _mm_srli_epi32(tmp6, 31); 
	tmp3 = _mm_slli_epi32(tmp3, 1); 
	tmp6 = _mm_slli_epi32(tmp6, 1);
	
	tmp9 = _mm_srli_si128(tmp7, 12); 
	tmp8 = _mm_slli_si128(tmp8, 4); 
	tmp7 = _mm_slli_si128(tmp7, 4); 
	tmp3 = _mm_or_si128(tmp3, tmp7); 
	tmp6 = _mm_or_si128(tmp6, tmp8); 
	tmp6 = _mm_or_si128(tmp6, tmp9);
	
	/* First phase of the reduction */
	tmp7 = _mm_slli_epi32(tmp3, 31); 
	tmp8 = _mm_slli_epi32(tmp3, 30); 
	tmp9 = _mm_slli_epi32(tmp3, 25);
	
	tmp7 = _mm_xor_si128(tmp7, tmp8); 
	tmp7 = _mm_xor_si128(tmp7, tmp9); 
	tmp8 = _mm_srli_si128(tmp7, 4); 
	tmp7 = _mm_slli_si128(tmp7, 12); 
	tmp3 = _mm_xor_si128(tmp3, tmp7);
	
	/* Second phase of the reduction, then fold into the high half */
	tmp2 = _mm_srli_epi32(tmp3, 1); 
	tmp4 = _mm_srli_epi32(tmp3, 2); 
	tmp5 = _mm_srli_epi32(tmp3, 7); 
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5); 
	tmp2 = _mm_xor_si128(tmp2, tmp8); 
	tmp3 = _mm_xor_si128(tmp3, tmp2); 
	tmp6 = _mm_xor_si128(tmp6, tmp3);
	
	*res = tmp6;
}

/* Carry-less multiply a by b and reduce the 256-bit product to 128 bits,
/  storing the result in *res. The reduction uses the same shift counts
/  (31/30/25 and 1/2/7) as reduce4, matching GCM's x^128 + x^7 + x^2 + x + 1.
/  res may alias a or b: both are passed by value.                         */
static void gfmul (__m128i a, __m128i b, __m128i *res)
{
	__m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
	/* Four 64x64 partial products: lo*lo, two cross terms, hi*hi */
	tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
	tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
	tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
	tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
	/* Combine the cross terms and fold them into tmp6:tmp3 (hi:lo) */
	tmp4 = _mm_xor_si128(tmp4, tmp5);
	tmp5 = _mm_slli_si128(tmp4, 8);
	tmp4 = _mm_srli_si128(tmp4, 8);
	tmp3 = _mm_xor_si128(tmp3, tmp5);
	tmp6 = _mm_xor_si128(tmp6, tmp4);
	/* Shift the 256-bit product left by one bit with lane carries */
	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);
	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);
	/* First phase of the reduction */
	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);
	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);
	/* Second phase of the reduction, then fold into the high half */
	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7); 
	tmp2 = _mm_xor_si128(tmp2, tmp4); 
	tmp2 = _mm_xor_si128(tmp2, tmp5); 
	tmp2 = _mm_xor_si128(tmp2, tmp8); 
	tmp3 = _mm_xor_si128(tmp3, tmp2); 
	tmp6 = _mm_xor_si128(tmp6, tmp3);
	*res = tmp6;
}

static void AES_GCM_encrypt(const unsigned char *in,
                     unsigned char *out,
                     const unsigned char* addt,
                     const unsigned char* ivec,
                     unsigned char *tag,
                     int nbytes,
                     int abytes,
                     int ibytes,
                     const unsigned char* key)
{
	int i, j ,k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i H, H2, H3, H4, Y, T;
	__m128i *KEY = (__m128i*)key;
	__m128i ctr1, ctr2, ctr3, ctr4; 
	__m128i last_block = _mm_setzero_si128(); 
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0); 
	__m128i FOUR = _mm_set_epi32(0, 4, 0, 0); 
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7); 
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); 
	__m128i X = _mm_setzero_si128();
	
	Y = _mm_load_si128((__m128i*)ivec);
	Y = _mm_insert_epi32(Y, 0x1000000, 3);
	/*(Compute E[ZERO, KS] and E[Y0, KS] together*/
	tmp1 = _mm_xor_si128(X, KEY[0]);
	tmp2 = _mm_xor_si128(Y, KEY[0]);
	for(j=1; j < 10-1; j+=2) {
		tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
		
		tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
	}
	tmp1 = _mm_aesenc_si128(tmp1, KEY[10-1]); 
	tmp2 = _mm_aesenc_si128(tmp2, KEY[10-1]);
	
	H = _mm_aesenclast_si128(tmp1, KEY[10]); 
	T = _mm_aesenclast_si128(tmp2, KEY[10]);
	
	H = _mm_shuffle_epi8(H, BSWAP_MASK);

	gfmul(H,H,&H2); 
	gfmul(H,H2,&H3); 
	gfmul(H,H3,&H4);
	
	for(i=0; i<abytes/16/4; i++) { 
		tmp1 = _mm_load_si128(&((__m128i*)addt)[i*4]); 
		tmp2 = _mm_load_si128(&((__m128i*)addt)[i*4+1]); 
		tmp3 = _mm_load_si128(&((__m128i*)addt)[i*4+2]); 
		tmp4 = _mm_load_si128(&((__m128i*)addt)[i*4+3]);
		
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK); 
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK); 
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		
		tmp1 = _mm_xor_si128(X, tmp1);
		
		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for(i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_load_si128(&((__m128i*)addt)[i]); 
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 
		X = _mm_xor_si128(X,tmp1); 
		gfmul(X, H, &X);
	}
	if(abytes%16) {
		last_block = _mm_setzero_si128();
		for(j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X =_mm_xor_si128(X,tmp1);
		gfmul(X,H,&X);
	}
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64); 
	ctr1 = _mm_add_epi64(ctr1, ONE); 
	ctr2 = _mm_add_epi64(ctr1, ONE); 
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	
	for(i=0; i<nbytes/16/4; i++){ 
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); 
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64); 
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64); 
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		
		ctr1 = _mm_add_epi64(ctr1, FOUR); 
		ctr2 = _mm_add_epi64(ctr2, FOUR); 
		ctr3 = _mm_add_epi64(ctr3, FOUR); 
		ctr4 = _mm_add_epi64(ctr4, FOUR);
		
		tmp1 =_mm_xor_si128(tmp1, KEY[0]);
		tmp2 =_mm_xor_si128(tmp2, KEY[0]);
		tmp3 =_mm_xor_si128(tmp3, KEY[0]); 
		tmp4 =_mm_xor_si128(tmp4, KEY[0]);
		
		for(j=1; j<10-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); 
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]); 
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]); 
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]); 
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j+1]); 
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j+1]);
		}
		
		tmp1 = _mm_aesenc_si128(tmp1, KEY[10-1]); 
		tmp2 = _mm_aesenc_si128(tmp2, KEY[10-1]); 
		tmp3 = _mm_aesenc_si128(tmp3, KEY[10-1]); 
		tmp4 = _mm_aesenc_si128(tmp4, KEY[10-1]);
		
		tmp1 =_mm_aesenclast_si128(tmp1, KEY[10]);
		tmp2 =_mm_aesenclast_si128(tmp2, KEY[10]); 
		tmp3 =_mm_aesenclast_si128(tmp3, KEY[10]); 
		tmp4 =_mm_aesenclast_si128(tmp4, KEY[10]);
		
		tmp1 = _mm_xor_si128(tmp1, _mm_load_si128(&((__m128i*)in)[i*4+0]));
		tmp2 = _mm_xor_si128(tmp2, _mm_load_si128(&((__m128i*)in)[i*4+1])); 
		tmp3 = _mm_xor_si128(tmp3, _mm_load_si128(&((__m128i*)in)[i*4+2])); 
		tmp4 = _mm_xor_si128(tmp4, _mm_load_si128(&((__m128i*)in)[i*4+3]));
		
		_mm_store_si128(&((__m128i*)out)[i*4+0], tmp1); 
		_mm_store_si128(&((__m128i*)out)[i*4+1], tmp2); 
		_mm_store_si128(&((__m128i*)out)[i*4+2], tmp3); 
		_mm_store_si128(&((__m128i*)out)[i*4+3], tmp4);
		
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK); 
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK); 
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);
		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	} 
	
	for(k=i*4; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp1 = _mm_aesenc_si128(tmp1, KEY[1]);
		tmp1 = _mm_aesenc_si128(tmp1, KEY[2]);
		tmp1 = _mm_aesenc_si128(tmp1, KEY[3]);
		tmp1 = _mm_aesenc_si128(tmp1, KEY[4]);
		tmp1 = _mm_aesenc_si128(tmp1, KEY[5]);
		tmp1 = _mm_aesenc_si128(tmp1, KEY[6]);
		tmp1 = _mm_aesenc_si128(tmp1, KEY[7]);
		tmp1 = _mm_aesenc_si128(tmp1, KEY[8]);
		tmp1 = _mm_aesenc_si128(tmp1, KEY[9]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[10]);

		tmp1 = _mm_xor_si128(tmp1, _mm_load_si128(&((__m128i*)in)[k]));
		_mm_store_si128(&((__m128i*)out)[k], tmp1); 
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 
		X = _mm_xor_si128(X, tmp1); 
		gfmul(X,H,&X);
	}
	//If remains one incomplete block
	if(nbytes%16){
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); 
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp1 = _mm_aesenc_si128(tmp1, KEY[1]);
		tmp1 = _mm_aesenc_si128(tmp1, KEY[2]);
		tmp1 = _mm_aesenc_si128(tmp1, KEY[3]);
		tmp1 = _mm_aesenc_si128(tmp1, KEY[4]);
		tmp1 = _mm_aesenc_si128(tmp1, KEY[5]);
		tmp1 = _mm_aesenc_si128(tmp1, KEY[6]);
		tmp1 = _mm_aesenc_si128(tmp1, KEY[7]);
		tmp1 = _mm_aesenc_si128(tmp1, KEY[8]);
		tmp1 = _mm_aesenc_si128(tmp1, KEY[9]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[10]);
		tmp1 = _mm_xor_si128(tmp1, _mm_load_si128(&((__m128i*)in)[k])); 
		last_block = tmp1; 
		#if SAFE_OUTPUT_BUFFERS
		_mm_store_si128(&((__m128i*)out)[k], tmp1); 
		#else
		for(j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
		#endif
		for(j=nbytes%16; j<16; j++)
			((unsigned char*)&last_block)[j] = 0;
		tmp1 = last_block; 
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 
		X = _mm_xor_si128(X, tmp1); 
		gfmul(X, H, &X);
	}
	tmp1 = _mm_insert_epi64(tmp1, nbytes*8, 0); 
	tmp1 = _mm_insert_epi64(tmp1, abytes*8, 1);
	X = _mm_xor_si128(X, tmp1); 
	gfmul(X,H,&X); 
	X = _mm_shuffle_epi8(X, BSWAP_MASK); 
	T = _mm_xor_si128(X, T); 
	_mm_store_si128((__m128i*)tag, T);
}

/* Encrypt and authenticate one message with the AES-NI intrinsics code.
/    nonce   12-byte nonce
/    pt/ct   plaintext in, ciphertext out, pt_len bytes
/    ad      ad_len bytes of associated data
/    tag     when non-NULL receives the 16-byte tag and pt_len is returned;
/            when NULL the tag is appended to ct and pt_len + 16 returned,
/            as the other ae_encrypt variants in this file do
/    final   not consulted
/  The tag is produced in a local 16-byte aligned buffer because
/  AES_GCM_encrypt writes it with an aligned store; it is then copied to
/  wherever the caller wants it, which may be unaligned.                   */
int ae_encrypt(ae_ctx     *ctx,
               const void *nonce,
               const void *pt,
               int         pt_len,
               const void *ad,
               int         ad_len,
               void       *ct,
               void       *tag,
               int         final)
{
	ALIGN(16) unsigned char computed_tag[16];

	(void)final;

	AES_GCM_encrypt(pt, ct, ad, nonce, computed_tag, pt_len, ad_len, 12, ctx->KEY);
	if (tag) {
		memcpy(tag, computed_tag, 16);
		return pt_len;
	}
	memcpy((char *)ct + pt_len, computed_tag, 16);
	return pt_len + 16;
}

/* ----------------------------------------------------------------------- */
#endif
/* ----------------------------------------------------------------------- */

/* ----------------------------------------------------------------------- */
/* Public functions                                                        */
/* ----------------------------------------------------------------------- */

/* Some systems do not 16-byte-align dynamic allocations involving 16-byte
/  vectors. Adjust the following if your system is one of these            */

/* These determine how to allocate 16-byte aligned vectors, if needed.
/  USE_MM_MALLOC: SSE2 is advertised but the target is not 64-bit x86.
/  USE_POSIX_MEMALIGN: AltiVec with glibc on a non-64-bit PowerPC target.
/  In every other configuration ae_allocate falls back to plain malloc.    */
#define USE_MM_MALLOC ((__SSE2__ || _M_IX86_FP>=2) && !(_M_X64 || __x86_64__))
#define USE_POSIX_MEMALIGN (__ALTIVEC__ && __GLIBC__ && !__PPC64__)

/* Allocate storage for one ae_ctx, using a 16-byte aligning allocator
/  where the configuration macros ask for one. Returns NULL when the
/  allocation fails. Release the result with ae_free.                      */
ae_ctx* ae_allocate(void *misc)
{
	void *mem;

	(void) misc;                     /* misc unused in this implementation */
#if USE_MM_MALLOC
	mem = _mm_malloc(sizeof(ae_ctx),16);
#elif USE_POSIX_MEMALIGN
	if (posix_memalign(&mem,16,sizeof(ae_ctx)) != 0)
		mem = NULL;
#else
	mem = malloc(sizeof(ae_ctx));
#endif
	return (ae_ctx *)mem;
}

/* Release a context obtained from ae_allocate, using the deallocator that
/  matches the allocator chosen there.                                     */
void ae_free(ae_ctx *ctx)
{
#if USE_MM_MALLOC
	_mm_free(ctx);
#else
	free(ctx);
#endif
}

/* Overwrite the whole context with zero bytes, undoing initialization.
/  Always returns AE_SUCCESS.                                              */
int ae_clear (ae_ctx *ctx)
{
	memset(ctx, 0, sizeof *ctx);
	return AE_SUCCESS;
}

/* Report the size of an ae_ctx in bytes.                                  */
int ae_ctx_sizeof(void)
{
	const int ctx_bytes = (int) sizeof(ae_ctx);
	return ctx_bytes;
}



/* Human-readable name of the implementation that was compiled in.
/  NOTE(review): the tests here are ordered differently from the chain that
/  selects the implementation above (there USE_KASPER_AES and the OpenSSL
/  options are tested before USE_AES_NI), so with several USE_* macros set
/  the string may not name the code actually built -- confirm.             */
#if USE_AES_NI
char infoString[] = "GCM (AES-NI Intel Intrinsics)";  /* Each AE implementation must have a global one */
#elif USE_KASPER_AES
char infoString[] = "GCM (Kasper)";  /* Each AE implementation must have a global one */
#elif USE_OPENSSL_AES_NI
char infoString[] = "GCM (AES-NI OpenSSL)";  /* Each AE implementation must have a global one */
#else
char infoString[] = "GCM-256 (OpenSSL)";  /* Each AE implementation must have a global one */
#endif