/*
 * Files
 * cpuminer-opt-gpu/algo/groestl/sse2/grso-asm2.c
 * 2016-09-22 13:16:18 -04:00
 *
 * 1017 lines
 * 49 KiB
 * C
 */

/* sse4 optimized asm */
/* not really any faster, as most of the time is spent loading up a huge table of 1024 ints
* need to write a small-lanes groestl with sse loads and partial operations
* could be faster for one block if doing partial transforms on a single block
* without lanes transforms, the function could break after 64 bytes are finished
*/
#include "grso-asm.h"
/*
 * grsoP1024ASM - in-place, table-driven transform of the sixteen 64-bit
 * words x[0..15], written as one x86-64 inline-assembly block.
 * (By its name and the file header this is the P permutation of the
 * 1024-bit Groestl state - NOTE(review): confirm against the C reference.)
 *
 * x: pointer to 128 bytes of state. The state is loaded with movaps, so the
 *    pointer must be 16-byte aligned. The result overwrites the input.
 *
 * How the code is organised:
 *  - The sixteen state words live in the low halves of xmm0..xmm15
 *    (each movaps fetches two words, movhlps splits off the upper one).
 *  - rbx is the round counter; the loop body runs for rbx = 0..13.
 *  - In each round the bytes of every word are peeled off through rax/rcx
 *    (al/ah/cl/ch plus 16-bit shifts) and used as indices into the external
 *    tables grsoT0..grsoT7 (8-byte entries). The looked-up values are XORed
 *    into the accumulators mm0..mm7.
 *  - Before the lookups, the low byte of a word is XORed with its column
 *    constant (0x00, 0x10, ..., 0xf0) and with the round counter. Where a
 *    word is only used after its low byte has been shifted out, that XOR is
 *    skipped.
 *  - A round produces sixteen output words in two batches of eight. The
 *    first batch is stored to x[0..7] (the inputs are still held in the xmm
 *    registers), the second batch stays in mm0..mm7. Between rounds the
 *    second batch is moved to xmm8..xmm15 and the first batch is re-read
 *    from memory into xmm0..xmm7. After the last round the second batch is
 *    stored to x[8..15].
 *  - The tables are addressed absolutely (grsoTn(,%reg,8)).
 *    NOTE(review): this presumably requires a non-PIC/non-PIE build - confirm.
 *
 * The block ends with emms: MMX registers alias the x87 register stack, and
 * the ABI requires the CPU to be back in x87 mode when a function returns.
 */
void grsoP1024ASM(u64 *x) {
asm (
"\n ### load input state from memory to 16 low halves of XMM registers xmm0...xmm15"
"\n movaps 0(%0), %%xmm0"
"\n movhlps %%xmm0, %%xmm1"
"\n movaps 16(%0), %%xmm2"
"\n movhlps %%xmm2, %%xmm3"
"\n movaps 32(%0), %%xmm4"
"\n movhlps %%xmm4, %%xmm5"
"\n movaps 48(%0), %%xmm6"
"\n movhlps %%xmm6, %%xmm7"
"\n movaps 64(%0), %%xmm8"
"\n movhlps %%xmm8, %%xmm9"
"\n movaps 80(%0), %%xmm10"
"\n movhlps %%xmm10, %%xmm11"
"\n movaps 96(%0), %%xmm12"
"\n movhlps %%xmm12, %%xmm13"
"\n movaps 112(%0), %%xmm14"
"\n movhlps %%xmm14, %%xmm15"
"\n xorq %%rbx, %%rbx"
"\n 1: # beginning of the loop"
"\n ### process 1st special pair of input words, words x[2], x[11]"
"\n movq %%xmm2, %%rax"
"\n xorq $0x20, %%rax #xor column dependent constant to x[2]"
"\n xorq %%rbx, %%rax #xor round counter"
"\n movq %%xmm11, %%rcx"
"\n shrq $32, %%rcx #no need add constants to x[11] since it's shifted by 32 bits"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n movq grsoT0(,%%rdx,8), %%mm2"
"\n movq grsoT4(,%%rdi,8), %%mm7"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n movq grsoT1(,%%rdx,8), %%mm1"
"\n movq grsoT5(,%%rdi,8), %%mm6"
"\n shrq $16, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n movq grsoT2(,%%rdx,8), %%mm0"
"\n movq grsoT6(,%%rdi,8), %%mm5"
"\n shrq $40,%%rax"
"\n movzbl %%al, %%edx"
"\n movzbl %%ch, %%edi"
"\n pxor grsoT7(,%%rdx,8), %%mm7"
"\n pxor grsoT7(,%%rdi,8), %%mm0"
"\n ### process the third pair of input words, words x[4], x[9]"
"\n movq %%xmm9, %%rcx"
"\n movq %%xmm4, %%rax"
"\n xorq $0x40, %%rax #xor column dependent constant to x[4]"
"\n xorq %%rbx, %%rax #xor round counter"
"\n shrq $16, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n movq grsoT0(,%%rdx,8), %%mm4"
"\n pxor grsoT2(,%%rdi,8), %%mm7"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n movq grsoT1(,%%rdx,8), %%mm3"
"\n pxor grsoT3(,%%rdi,8), %%mm6"
"\n shrq $16, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT2(,%%rdx,8), %%mm2"
"\n pxor grsoT4(,%%rdi,8), %%mm5"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n pxor grsoT3(,%%rdx,8), %%mm1"
"\n pxor grsoT5(,%%rdi,8), %%mm4"
"\n shrq $16, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT4(,%%rdx,8), %%mm0"
"\n pxor grsoT6(,%%rdi,8), %%mm3"
"\n ### process 2nd special pair of input words, words x[1], x[12]"
"\n movq %%xmm12, %%rcx"
"\n movq %%xmm1, %%rax"
"\n xorq $0x10, %%rax #xor column dependent constant to x[1]"
"\n xorq %%rbx, %%rax #xor round counter"
"\n shrq $40, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT0(,%%rdx,8), %%mm1"
"\n pxor grsoT5(,%%rdi,8), %%mm7"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n pxor grsoT1(,%%rdx,8), %%mm0"
"\n pxor grsoT6(,%%rdi,8), %%mm6"
"\n shrq $56, %%rax"
"\n shrq $16, %%rcx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%al, %%edx"
"\n pxor grsoT7(,%%rdx,8), %%mm6"
"\n pxor grsoT7(,%%rdi,8), %%mm1"
"\n ### process the fourth pair of input words, words x[3], x[10]"
"\n movq %%xmm10, %%rcx"
"\n movq %%xmm3, %%rax"
"\n xorq $0x30, %%rax #xor column dependent constant to x[3]"
"\n xorq %%rbx, %%rax #xor round counter"
"\n shrq $24, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT0(,%%rdx,8), %%mm3"
"\n pxor grsoT3(,%%rdi,8), %%mm7"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n pxor grsoT1(,%%rdx,8), %%mm2"
"\n pxor grsoT4(,%%rdi,8), %%mm6"
"\n shrq $16, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT2(,%%rdx,8), %%mm1"
"\n pxor grsoT5(,%%rdi,8), %%mm5"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n pxor grsoT3(,%%rdx,8), %%mm0"
"\n pxor grsoT6(,%%rdi,8), %%mm4"
"\n shrq $16, %%rcx"
"\n ### process 3rd special pair of input words, words x[0], x[13]"
"\n movq %%xmm13, %%rcx"
"\n movq %%xmm0, %%rax"
"\n xorq %%rbx, %%rax #xor round counter to x[0], column dependent const =0"
"\n shrq $48, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT0(,%%rdx,8), %%mm0"
"\n pxor grsoT6(,%%rdi,8), %%mm7"
"\n shrq $48, %%rax"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n pxor grsoT7(,%%rdx,8), %%mm5"
"\n pxor grsoT7(,%%rdi,8), %%mm2"
"\n ### process the second pair of input words, words x[5], x[8]"
"\n movq %%xmm8, %%rcx"
"\n movq %%xmm5, %%rax"
"\n xorq $0x50, %%rax #xor column dependent constant to x[5]"
"\n xorq %%rbx, %%rax #xor round counter to x[5]"
"\n shrq $8, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT0(,%%rdx,8), %%mm5"
"\n pxor grsoT1(,%%rdi,8), %%mm7"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n pxor grsoT1(,%%rdx,8), %%mm4"
"\n pxor grsoT2(,%%rdi,8), %%mm6"
"\n shrq $16, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT2(,%%rdx,8), %%mm3"
"\n pxor grsoT3(,%%rdi,8), %%mm5"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n pxor grsoT3(,%%rdx,8), %%mm2"
"\n pxor grsoT4(,%%rdi,8), %%mm4"
"\n shrq $16, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT4(,%%rdx,8), %%mm1"
"\n pxor grsoT5(,%%rdi,8), %%mm3"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n pxor grsoT5(,%%rdx,8), %%mm0"
"\n pxor grsoT6(,%%rdi,8), %%mm2"
"\n shrq $16, %%rcx"
"\n ### process 4th special pair of input words, words x[14], x[15]"
"\n movq %%xmm15, %%rcx"
"\n movq %%xmm14, %%rax"
"\n shrq $56, %%rcx"
"\n shrq $56, %%rax"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT7(,%%rdx,8), %%mm3"
"\n pxor grsoT7(,%%rdi,8), %%mm4"
"\n ### process the first pair of input words, words x[6], x[7]"
"\n movq %%xmm6, %%rax"
"\n movq %%xmm7, %%rcx"
"\n xorq $0x60, %%rax #xor column dependent constant to x[6]"
"\n xorq $0x70, %%rcx #xor column dependent constant to x[7]"
"\n xorq %%rbx, %%rax #xor round counter to x[6]"
"\n xorq %%rbx, %%rcx #xor round counter to x[7]"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT0(,%%rdx,8), %%mm6"
"\n pxor grsoT0(,%%rdi,8), %%mm7"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n pxor grsoT1(,%%rdx,8), %%mm5"
"\n pxor grsoT1(,%%rdi,8), %%mm6"
"\n shrq $16, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT2(,%%rdx,8), %%mm4"
"\n pxor grsoT2(,%%rdi,8), %%mm5"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n pxor grsoT3(,%%rdx,8), %%mm3"
"\n pxor grsoT3(,%%rdi,8), %%mm4"
"\n shrq $16, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT4(,%%rdx,8), %%mm2"
"\n pxor grsoT4(,%%rdi,8), %%mm3"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n pxor grsoT5(,%%rdx,8), %%mm1"
"\n pxor grsoT5(,%%rdi,8), %%mm2"
"\n shrq $16, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT6(,%%rdx,8), %%mm0"
"\n pxor grsoT6(,%%rdi,8), %%mm1"
"\n ### writes contents of MM0..MM7 to memory "
"\n movq %%mm7, 56(%0)"
"\n movq %%mm6, 48(%0)"
"\n movq %%mm5, 40(%0)"
"\n movq %%mm4, 32(%0)"
"\n movq %%mm3, 24(%0)"
"\n movq %%mm2, 16(%0)"
"\n movq %%mm1, 8(%0)"
"\n movq %%mm0, 0(%0)"
"\n #use the remaining data in ah, ch to process"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n movq grsoT7(,%%rdx,8), %%mm3"
"\n movq grsoT7(,%%rdi,8), %%mm4"
"\n ### process the first pair of input words, words x[14], x[15]"
"\n movq %%xmm14, %%rax"
"\n movq %%xmm15, %%rcx"
"\n xorq $0xe0, %%rax #xor column dependent constant to x[14]"
"\n xorq $0xf0, %%rcx #xor column dependent constant to x[15]"
"\n xorq %%rbx, %%rax #xor round counter to x[14]"
"\n xorq %%rbx, %%rcx #xor round counter to x[15]"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n movq grsoT0(,%%rdx,8), %%mm6"
"\n movq grsoT0(,%%rdi,8), %%mm7"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n movq grsoT1(,%%rdx,8), %%mm5"
"\n pxor grsoT1(,%%rdi,8), %%mm6"
"\n shrq $16, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT2(,%%rdx,8), %%mm4"
"\n pxor grsoT2(,%%rdi,8), %%mm5"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n pxor grsoT3(,%%rdx,8), %%mm3"
"\n pxor grsoT3(,%%rdi,8), %%mm4"
"\n shrq $16, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n movq grsoT4(,%%rdx,8), %%mm2"
"\n pxor grsoT4(,%%rdi,8), %%mm3"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n movq grsoT5(,%%rdx,8), %%mm1"
"\n pxor grsoT5(,%%rdi,8), %%mm2"
"\n shrq $16, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n movq grsoT6(,%%rdx,8), %%mm0"
"\n pxor grsoT6(,%%rdi,8), %%mm1"
"\n ### process 3rd special pair of input words, words x[8], x[5]"
"\n movq %%xmm5, %%rcx"
"\n movq %%xmm8, %%rax"
"\n xorq $0x80, %%rax #xor column dependent constant to x[8]"
"\n xorq %%rbx, %%rax #xor round counter"
"\n shrq $48, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT0(,%%rdx,8), %%mm0"
"\n pxor grsoT6(,%%rdi,8), %%mm7"
"\n shrq $48, %%rax"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n pxor grsoT7(,%%rdx,8), %%mm5"
"\n pxor grsoT7(,%%rdi,8), %%mm2"
"\n ### process the second pair of input words, words x[13], x[0]"
"\n movq %%xmm0, %%rcx"
"\n movq %%xmm13, %%rax"
"\n xorq $0xd0, %%rax #xor column dependent constant to x[13]"
"\n xorq %%rbx, %%rax #xor round counter"
"\n shrq $8, %%rcx #no column constant and after shift no round counter either"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT0(,%%rdx,8), %%mm5"
"\n pxor grsoT1(,%%rdi,8), %%mm7"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n pxor grsoT1(,%%rdx,8), %%mm4"
"\n pxor grsoT2(,%%rdi,8), %%mm6"
"\n shrq $16, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT2(,%%rdx,8), %%mm3"
"\n pxor grsoT3(,%%rdi,8), %%mm5"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n pxor grsoT3(,%%rdx,8), %%mm2"
"\n pxor grsoT4(,%%rdi,8), %%mm4"
"\n shrq $16, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT4(,%%rdx,8), %%mm1"
"\n pxor grsoT5(,%%rdi,8), %%mm3"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n pxor grsoT5(,%%rdx,8), %%mm0"
"\n pxor grsoT6(,%%rdi,8), %%mm2"
"\n shrq $16, %%rcx"
"\n ### process the third pair of input words, words x[12], x[1]"
"\n movq %%xmm1, %%rcx"
"\n movq %%xmm12, %%rax"
"\n xorq $0xc0, %%rax #xor column dependent constant to x[12]"
"\n xorq %%rbx, %%rax #xor round counter to x[12]"
"\n shrq $16, %%rcx #constant disappears after shift"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT0(,%%rdx,8), %%mm4"
"\n pxor grsoT2(,%%rdi,8), %%mm7"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n pxor grsoT1(,%%rdx,8), %%mm3"
"\n pxor grsoT3(,%%rdi,8), %%mm6"
"\n shrq $16, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT2(,%%rdx,8), %%mm2"
"\n pxor grsoT4(,%%rdi,8), %%mm5"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n pxor grsoT3(,%%rdx,8), %%mm1"
"\n pxor grsoT5(,%%rdi,8), %%mm4"
"\n shrq $16, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT4(,%%rdx,8), %%mm0"
"\n pxor grsoT6(,%%rdi,8), %%mm3"
"\n ### process 2nd special pair of input words, words x[9], x[4]"
"\n movq %%xmm4, %%rcx"
"\n movq %%xmm9, %%rax"
"\n xorq $0x90, %%rax #xor round dependent constant to x[9]"
"\n xorq %%rbx, %%rax #xor round counter to x[9]"
"\n shrq $40, %%rcx #constant disappears after shift in x[4]"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT0(,%%rdx,8), %%mm1"
"\n pxor grsoT5(,%%rdi,8), %%mm7"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n pxor grsoT1(,%%rdx,8), %%mm0"
"\n pxor grsoT6(,%%rdi,8), %%mm6"
"\n shrq $56, %%rax"
"\n shrq $16, %%rcx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%al, %%edx"
"\n pxor grsoT7(,%%rdx,8), %%mm6"
"\n pxor grsoT7(,%%rdi,8), %%mm1"
"\n ### process the fourth pair of input words, words x[11], x[2]"
"\n movq %%xmm2, %%rcx"
"\n movq %%xmm11, %%rax"
"\n xorq $0xb0, %%rax #xor column dependent constant to x[11]"
"\n xorq %%rbx, %%rax #xor round counter to x[11]"
"\n shrq $24, %%rcx #constants disappear after shift in x[2]"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT0(,%%rdx,8), %%mm3"
"\n pxor grsoT3(,%%rdi,8), %%mm7"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n pxor grsoT1(,%%rdx,8), %%mm2"
"\n pxor grsoT4(,%%rdi,8), %%mm6"
"\n shrq $16, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT2(,%%rdx,8), %%mm1"
"\n pxor grsoT5(,%%rdi,8), %%mm5"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n pxor grsoT3(,%%rdx,8), %%mm0"
"\n pxor grsoT6(,%%rdi,8), %%mm4"
"\n shrq $16, %%rcx"
"\n ### process 1st special pair of input words, words x[10], x[3]"
"\n movq %%xmm10, %%rax"
"\n movq %%xmm3, %%rcx"
"\n xorq $0xa0, %%rax #xor column dependent constant"
"\n xorq %%rbx, %%rax #xor round counter"
"\n shrq $32, %%rcx #constants disappear after shift"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT0(,%%rdx,8), %%mm2"
"\n pxor grsoT4(,%%rdi,8), %%mm7"
"\n movzbl %%ah, %%edx"
"\n movzbl %%ch, %%edi"
"\n shrq $16, %%rax"
"\n pxor grsoT1(,%%rdx,8), %%mm1"
"\n pxor grsoT5(,%%rdi,8), %%mm6"
"\n shrq $16, %%rcx"
"\n movzbl %%al, %%edx"
"\n movzbl %%cl, %%edi"
"\n pxor grsoT2(,%%rdx,8), %%mm0"
"\n pxor grsoT6(,%%rdi,8), %%mm5"
"\n shrq $40,%%rax"
"\n movzbl %%al, %%edx"
"\n movzbl %%ch, %%edi"
"\n pxor grsoT7(,%%rdx,8), %%mm7"
"\n pxor grsoT7(,%%rdi,8), %%mm0"
"\n incq %%rbx"
"\n cmp $14, %%rbx"
"\n je 2f"
"\n ### move 8 MMX registers to low halves of XMM registers"
"\n movq2dq %%mm0, %%xmm8"
"\n movq2dq %%mm1, %%xmm9"
"\n movq2dq %%mm2, %%xmm10"
"\n movq2dq %%mm3, %%xmm11"
"\n movq2dq %%mm4, %%xmm12"
"\n movq2dq %%mm5, %%xmm13"
"\n movq2dq %%mm6, %%xmm14"
"\n movq2dq %%mm7, %%xmm15"
"\n ### read back 8 words of input state from memory to 8 low halves of XMM registers xmm0...xmm15"
"\n movaps 0(%0), %%xmm0"
"\n movhlps %%xmm0, %%xmm1"
"\n movaps 16(%0), %%xmm2"
"\n movhlps %%xmm2, %%xmm3"
"\n movaps 32(%0), %%xmm4"
"\n movhlps %%xmm4, %%xmm5"
"\n movaps 48(%0), %%xmm6"
"\n movhlps %%xmm6, %%xmm7"
"\n jmp 1b"
"\n 2: # finalization"
"\n ### writes contents of MM0..MM7 to memory "
"\n movq %%mm7, 120(%0)"
"\n movq %%mm6, 112(%0)"
"\n movq %%mm5, 104(%0)"
"\n movq %%mm4, 96(%0)"
"\n movq %%mm3, 88(%0)"
"\n movq %%mm2, 80(%0)"
"\n movq %%mm1, 72(%0)"
"\n movq %%mm0, 64(%0)"
/* MMX registers alias the x87 stack: clear the MMX state before returning
 * so that later x87 code (e.g. long double arithmetic) works correctly. */
"\n ### leave MMX mode, hand the register file back to the x87 FPU"
"\n emms"
: /* no output, only memory is modified */
: "r"(x)
/* "cc": xor/shr/inc/cmp all modify the flags register */
: "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "memory", "cc", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" , "%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" , "%xmm8" , "%xmm9" , "%xmm10" , "%xmm11" , "%xmm12" , "%xmm13" , "%xmm14" , "%xmm15" );
}//P1024ASM()
void grsoQ1024ASM(u64 *x) {
asm (
"\n ### load input state from memory to 16 low halves of XMM registers xmm0...xmm15"
"\n movaps 0(%0), %%xmm0"
"\n movhlps %%xmm0, %%xmm1"
"\n movaps 16(%0), %%xmm2"
"\n movhlps %%xmm2, %%xmm3"
"\n movaps 32(%0), %%xmm4"
"\n movhlps %%xmm4, %%xmm5"
"\n movaps 48(%0), %%xmm6"
"\n movhlps %%xmm6, %%xmm7"
"\n movaps 64(%0), %%xmm8"
"\n movhlps %%xmm8, %%xmm9"
"\n movaps 80(%0), %%xmm10"
"\n movhlps %%xmm10, %%xmm11"
"\n movaps 96(%0), %%xmm12"
"\n movhlps %%xmm12, %%xmm13"
"\n movaps 112(%0), %%xmm14"
"\n movhlps %%xmm14, %%xmm15"
"\n xorl %%ebx, %%ebx"
"\n 1: # beginning of the loop"
"\n ### load a pair of input words x[7], x[8] to process them"
"\n movq %%xmm7, %%rax #rax = [ x[7].0, x[7].1, x[7].2, x[7].3, x[7].4, x[7].5, x[7].6, x[7].7 ]"
"\n movq %%xmm8, %%rcx #rcx = [ x[8].0, x[8].1, x[8].2, x[8].3, x[8].4, x[8].5, x[8].6, x[8].7 ]"
"\n # xor column constants by xoring 0xfff...ff first and later xoring 0xi0 ^ r to bytes that need that"
"\n notq %%rax"
"\n notq %%rcx"
"\n # now we have free register xmm7 which we can use to XOR 0xfff..ff to the remaining ones"
"\n pcmpeqw %%xmm7, %%xmm7 #create mask of all ones in xmm7"
"\n pxor %%xmm7, %%xmm0"
"\n pxor %%xmm7, %%xmm1"
"\n pxor %%xmm7, %%xmm2"
"\n pxor %%xmm7, %%xmm3"
"\n pxor %%xmm7, %%xmm4"
"\n pxor %%xmm7, %%xmm5"
"\n pxor %%xmm7, %%xmm6"
"\n pxor %%xmm7, %%xmm8"
"\n pxor %%xmm7, %%xmm9"
"\n pxor %%xmm7, %%xmm10"
"\n pxor %%xmm7, %%xmm11"
"\n pxor %%xmm7, %%xmm12"
"\n pxor %%xmm7, %%xmm13"
"\n pxor %%xmm7, %%xmm14"
"\n pxor %%xmm7, %%xmm15"
"\n movq %%rax, %%xmm7 #restore orignal value of xmm7 for later"
"\n movzbl %%al, %%edx #edx = x[7].0"
"\n movzbl %%cl, %%edi #edi = x[8].0"
"\n movq grsoT0(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[7].0]"
"\n movq grsoT0(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[8].0]"
"\n movzbl %%ah, %%edx #edx = x[7].1"
"\n movzbl %%ch, %%edi #edi = x[8].1"
"\n movq grsoT1(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[7].1]"
"\n movq grsoT1(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[8].1]"
"\n shrq $16, %%rax #rax = [ x[7].2, x[7].3, x[7].4, x[7].5, x[7].6, x[7].7, 0, 0 ]"
"\n shrq $16, %%rcx #rcx = [ x[8].2, x[8].3, x[8].4, x[8].5, x[8].6, x[8].7, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[7].2"
"\n movzbl %%cl, %%edi #edi = x[8].2"
"\n movq grsoT2(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[7].2]"
"\n movq grsoT2(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[8].2]"
"\n shrq $16, %%rax #rax = [ x[7].4, x[7].5, x[7].6, x[7].7, 0, 0, 0, 0 ]"
"\n shrq $16, %%rcx #rcx = [ x[8].4, x[8].5, x[8].6, x[8].7, 0, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[7].4"
"\n pxor grsoT4(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]"
"\n movzbl %%ah, %%edx #edx = x[7].5"
"\n movzbl %%ch, %%edi #edi = x[8].5"
"\n pxor grsoT5(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]"
"\n pxor grsoT5(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]"
"\n shrq $16, %%rax #rax = [ x[7].6, x[7].7, 0, 0, 0, 0, 0, 0 ]"
"\n shrq $16, %%rcx #rcx = [ x[8].6, x[8].7, 0, 0, 0, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[7].6"
"\n movzbl %%cl, %%edi #edi = x[8].6"
"\n pxor grsoT6(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]"
"\n pxor grsoT6(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]"
"\n movzbl %%ah, %%edx #edx = x[7].7"
"\n movzbl %%ch, %%edi #edi = x[8].7"
"\n xorl $0x70, %%edx #xor column dependent part of const"
"\n xorl $0x80, %%edi #xor column dependent part of const"
"\n xorl %%ebx, %%edx #xor round counter"
"\n xorl %%ebx, %%edi #xor round counter"
"\n movq grsoT7(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[7].7]"
"\n pxor grsoT7(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]"
"\n ### load a pair of input words x[13], x[14] and process them"
"\n movq %%xmm13, %%rax #rax = [ x[13].0, x[13].1, x[13].2, x[13].3, x[13].4, x[13].5, x[13].6, x[13].7 ]"
"\n movq %%xmm14, %%rcx #rcx = [ x[14].0, x[14].1, x[14].2, x[14].3, x[14].4, x[14].5, x[14].6, x[14].7 ]"
"\n shrq $24, %%rax #rax = [ x[13].3, x[13].4, x[13].5, x[13].6, x[13].7, 0, 0, 0 ]"
"\n shrq $24, %%rcx #rcx = [ x[14].3, x[14].4, x[14].5, x[14].6, x[14].7, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[13].3"
"\n movzbl %%cl, %%edi #edi = x[14].3"
"\n pxor grsoT3(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]^grsoT1[x[5].1]^grsoT6[x[6].6]^grsoT0[x[3].0]^grsoT5[x[4].5]^grsoT4[x[2].4]^grsoT3[x[13].3]"
"\n pxor grsoT3(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]^grsoT1[x[6].1]^grsoT5[x[5].5]^grsoT0[x[4].0]^grsoT4[x[3].4]^grsoT7[x[9].7]^grsoT3[x[14].3]"
"\n shrq $32, %%rax #rax = [ x[13].7, 0, 0, 0, 0, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[13].7"
"\n xorl $0xd0, %%edx #xor column constant"
"\n xorl %%ebx, %%edx #xor round counter"
"\n pxor grsoT7(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]^grsoT3[x[2].3]^grsoT1[x[10].1]^grsoT5[x[9].5]^grsoT2[x[12].2]^grsoT6[x[11].6]^grsoT7[x[13].7]"
"\n ### load a pair of input words x[5], x[6] and process them"
"\n movq %%xmm5, %%rax #rax = [ x[5].0, x[5].1, x[5].2, x[5].3, x[5].4, x[5].5, x[5].6, x[5].7 ]"
"\n movq %%xmm6, %%rcx #rcx = [ x[6].0, x[6].1, x[6].2, x[6].3, x[6].4, x[6].5, x[6].6, x[6].7 ]"
"\n movzbl %%al, %%edx #edx = x[5].0"
"\n movzbl %%cl, %%edi #edi = x[6].0"
"\n pxor grsoT0(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]^grsoT0[x[5].0]"
"\n pxor grsoT0(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]^grsoT0[x[6].0]"
"\n movzbl %%ah, %%edx #edx = x[5].1"
"\n movzbl %%ch, %%edi #edi = x[6].1"
"\n pxor grsoT1(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]^grsoT1[x[5].1]"
"\n pxor grsoT1(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]^grsoT1[x[6].1]"
"\n shrq $16, %%rax #rax = [ x[5].2, x[5].3, x[5].4, x[5].5, x[5].6, x[5].7, 0, 0 ]"
"\n shrq $16, %%rcx #rcx = [ x[6].2, x[6].3, x[6].4, x[6].5, x[6].6, x[6].7, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[5].2"
"\n movzbl %%cl, %%edi #edi = x[6].2"
"\n movq grsoT2(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[5].2]"
"\n pxor grsoT2(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]"
"\n shrq $16, %%rax #rax = [ x[5].4, x[5].5, x[5].6, x[5].7, 0, 0, 0, 0 ]"
"\n shrq $16, %%rcx #rcx = [ x[6].4, x[6].5, x[6].6, x[6].7, 0, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[5].4"
"\n movzbl %%cl, %%edi #edi = x[6].4"
"\n pxor grsoT4(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]^grsoT0[x[6].0]^grsoT4[x[5].4]"
"\n pxor grsoT4(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]^grsoT4[x[6].4]"
"\n movzbl %%ah, %%edx #edx = x[5].5"
"\n movzbl %%ch, %%edi #edi = x[6].5"
"\n pxor grsoT5(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]^grsoT1[x[6].1]^grsoT5[x[5].5]"
"\n pxor grsoT5(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]^grsoT0[x[5].0]^grsoT5[x[6].5]"
"\n shrq $16, %%rax #rax = [ x[5].6, x[5].7, 0, 0, 0, 0, 0, 0 ]"
"\n shrq $16, %%rcx #rcx = [ x[6].6, x[6].7, 0, 0, 0, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[5].6"
"\n movzbl %%cl, %%edi #edi = x[6].6"
"\n pxor grsoT6(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]^grsoT6[x[5].6]"
"\n pxor grsoT6(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]^grsoT1[x[5].1]^grsoT6[x[6].6]"
"\n movzbl %%ch, %%edi #edi = x[6].7"
"\n xorl $0x60, %%edi #xor column dependent part of const"
"\n xorl %%ebx, %%edi #xor round conter"
"\n pxor grsoT7(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]"
"\n ### load a pair of input words x[15], x[0] and process them"
"\n movq %%xmm15, %%rax #rax = [ x[15].0, x[15].1, x[15].2, x[15].3, x[15].4, x[15].5, x[15].6, x[15].7 ]"
"\n movq %%xmm0, %%rcx #rcx = [ x[0].0, x[0].1, x[0].2, x[0].3, x[0].4, x[0].5, x[0].6, x[0].7 ]"
"\n shrq $24, %%rax #rax = [ x[15].3, x[15].4, x[15].5, x[15].6, x[15].7, 0, 0, 0 ]"
"\n shrq $24, %%rcx #rcx = [ x[0].3, x[0].4, x[0].5, x[0].6, x[0].7, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[15].3"
"\n movzbl %%cl, %%edi #edi = x[0].3"
"\n pxor grsoT3(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]^grsoT0[x[5].0]^grsoT5[x[6].5]^grsoT4[x[4].4]^grsoT2[x[9].2]^grsoT7[x[10].7]^grsoT3[x[15].3]"
"\n pxor grsoT3(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]^grsoT0[x[6].0]^grsoT4[x[5].4]^grsoT2[x[10].2]^grsoT6[x[9].6]^grsoT7[x[11].7]^grsoT3[x[0].3]"
"\n movzbl %%ch, %%edi #edi = x[0].4"
"\n pxor grsoT4(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]^grsoT1[x[3].1]^grsoT6[x[4].6]^grsoT0[x[1].0]^grsoT5[x[2].5]^grsoT3[x[11].3]^grsoT4[x[0].4]"
"\n ### load a pair of input words x[3], x[4] and process them"
"\n movq %%xmm3, %%rax #rax = [ x[3].0, x[3].1, x[3].2, x[3].3, x[3].4, x[3].5, x[3].6, x[3].7 ]"
"\n movq %%xmm4, %%rcx #rcx = [ x[4].0, x[4].1, x[4].2, x[4].3, x[4].4, x[4].5, x[4].6, x[4].7 ]"
"\n movzbl %%al, %%edx #edx = x[3].0"
"\n movzbl %%cl, %%edi #edi = x[4].0"
"\n pxor grsoT0(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]^grsoT1[x[5].1]^grsoT6[x[6].6]^grsoT0[x[3].0]"
"\n pxor grsoT0(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]^grsoT1[x[6].1]^grsoT5[x[5].5]^grsoT0[x[4].0]"
"\n movzbl %%ah, %%edx #edx = x[3].1"
"\n movzbl %%ch, %%edi #edi = x[4].1"
"\n pxor grsoT1(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]^grsoT1[x[3].1]"
"\n pxor grsoT1(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]^grsoT6[x[5].6]^grsoT1[x[4].1]"
"\n shrq $32, %%rax #rax = [ x[3].4, x[3].5, x[3].6, x[3].7, 0, 0, 0, 0 ]"
"\n shrq $32, %%rcx #rcx = [ x[4].4, x[4].5, x[4].6, x[4].7, 0, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[3].4"
"\n movzbl %%cl, %%edi #edi = x[4].4"
"\n pxor grsoT4(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]^grsoT1[x[6].1]^grsoT5[x[5].5]^grsoT0[x[4].0]^grsoT4[x[3].4]"
"\n pxor grsoT4(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]^grsoT0[x[5].0]^grsoT5[x[6].5]^grsoT4[x[4].4]"
"\n movzbl %%ah, %%edx #edx = x[3].5"
"\n movzbl %%ch, %%edi #edi = x[4].5"
"\n pxor grsoT5(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]^grsoT6[x[5].6]^grsoT1[x[4].1]^grsoT5[x[3].5]"
"\n pxor grsoT5(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]^grsoT1[x[5].1]^grsoT6[x[6].6]^grsoT0[x[3].0]^grsoT5[x[4].5]"
"\n shrq $16, %%rcx #rcx = [ x[4].6, x[4].7, 0, 0, 0, 0, 0, 0 ]"
"\n movzbl %%cl, %%edi #edi = x[4].6"
"\n pxor grsoT6(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]^grsoT1[x[3].1]^grsoT6[x[4].6]"
"\n ### load a pair of input words x[1], x[2] and process them"
"\n movq %%xmm1, %%rax #rax = [ x[1].0, x[1].1, x[1].2, x[1].3, x[1].4, x[1].5, x[1].6, x[1].7 ]"
"\n movq %%xmm2, %%rcx #rcx = [ x[2].0, x[2].1, x[2].2, x[2].3, x[2].4, x[2].5, x[2].6, x[2].7 ]"
"\n movzbl %%al, %%edx #edx = x[1].0"
"\n movzbl %%cl, %%edi #edi = x[2].0"
"\n pxor grsoT0(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]^grsoT1[x[3].1]^grsoT6[x[4].6]^grsoT0[x[1].0]"
"\n pxor grsoT0(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]^grsoT6[x[5].6]^grsoT1[x[4].1]^grsoT5[x[3].5]^grsoT0[x[2].0]"
"\n shrq $24, %%rax #rax = [ x[1].3, x[1].4, x[1].5, x[1].6, x[1].7, 0, 0, 0 ]"
"\n shrq $24, %%rcx #rcx = [ x[2].3, x[2].4, x[2].5, x[2].6, x[2].7, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[1].3"
"\n movzbl %%cl, %%edi #edi = x[2].3"
"\n pxor grsoT3(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]^grsoT4[x[6].4]^grsoT3[x[1].3]"
"\n pxor grsoT3(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]^grsoT3[x[2].3]"
"\n movzbl %%ah, %%edx #edx = x[1].4"
"\n movzbl %%ch, %%edi #edi = x[2].4"
"\n pxor grsoT4(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]^grsoT6[x[5].6]^grsoT1[x[4].1]^grsoT5[x[3].5]^grsoT0[x[2].0]^grsoT4[x[1].4]"
"\n pxor grsoT4(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]^grsoT1[x[5].1]^grsoT6[x[6].6]^grsoT0[x[3].0]^grsoT5[x[4].5]^grsoT4[x[2].4]"
"\n shrq $16, %%rcx #rcx = [ x[2].5, x[2].6, x[2].7, 0, 0, 0, 0, 0 ]"
"\n movzbl %%cl, %%edi #edi = x[2].5"
"\n pxor grsoT5(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]^grsoT1[x[3].1]^grsoT6[x[4].6]^grsoT0[x[1].0]^grsoT5[x[2].5]"
"\n ### load a pair of input words x[9], x[10] and process them"
"\n movq %%xmm9, %%rax #rax = [ x[9].0, x[9].1, x[9].2, x[9].3, x[9].4, x[9].5, x[9].6, x[9].7 ]"
"\n movq %%xmm10, %%rcx #rcx = [ x[10].0, x[10].1, x[10].2, x[10].3, x[10].4, x[10].5, x[10].6, x[10].7 ]"
"\n movzbl %%ah, %%edx #edx = x[9].1"
"\n movzbl %%ch, %%edi #edi = x[10].1"
"\n pxor grsoT1(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]^grsoT4[x[6].4]^grsoT3[x[1].3]^grsoT1[x[9].1]"
"\n pxor grsoT1(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]^grsoT3[x[2].3]^grsoT1[x[10].1]"
"\n shrq $16, %%rax #rax = [ x[9].2, x[9].3, x[9].4, x[9].5, x[9].6, x[9].7, 0, 0 ]"
"\n shrq $16, %%rcx #rcx = [ x[10].2, x[10].3, x[10].4, x[10].5, x[10].6, x[10].7, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[9].2"
"\n movzbl %%cl, %%edi #edi = x[10].2"
"\n pxor grsoT2(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]^grsoT0[x[5].0]^grsoT5[x[6].5]^grsoT4[x[4].4]^grsoT2[x[9].2]"
"\n pxor grsoT2(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]^grsoT0[x[6].0]^grsoT4[x[5].4]^grsoT2[x[10].2]"
"\n shrq $24, %%rax #rax = [ x[9].5, x[9].6, x[9].7, 0, 0, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[9].5"
"\n pxor grsoT5(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]^grsoT3[x[2].3]^grsoT1[x[10].1]^grsoT5[x[9].5]"
"\n shrq $8, %%rax #rax = [ x[9].6, x[9].7, 0, 0, 0, 0, 0, 0 ]"
"\n shrq $32, %%rcx #rcx = [ x[10].6, x[10].7, 0, 0, 0, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[9].6"
"\n movzbl %%cl, %%edi #edi = x[10].6"
"\n pxor grsoT6(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]^grsoT0[x[6].0]^grsoT4[x[5].4]^grsoT2[x[10].2]^grsoT6[x[9].6]"
"\n pxor grsoT6(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]^grsoT4[x[6].4]^grsoT3[x[1].3]^grsoT1[x[9].1]^grsoT6[x[10].6]"
"\n movzbl %%ah, %%edx #edx = x[9].7"
"\n movzbl %%ch, %%edi #edi = x[10].7"
"\n xorl $0x90, %%edx #xor column constant"
"\n xorl $0xa0, %%edi #xor column constant"
"\n xorl %%ebx, %%edx #xor round counter"
"\n xorl %%ebx, %%edi #xor round counter"
"\n pxor grsoT7(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]^grsoT1[x[6].1]^grsoT5[x[5].5]^grsoT0[x[4].0]^grsoT4[x[3].4]^grsoT7[x[9].7]"
"\n pxor grsoT7(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]^grsoT0[x[5].0]^grsoT5[x[6].5]^grsoT4[x[4].4]^grsoT2[x[9].2]^grsoT7[x[10].7]"
"\n ### load a pair of input words x[11], x[12] and process them"
"\n movq %%xmm11, %%rax #rax = [ x[11].0, x[11].1, x[11].2, x[11].3, x[11].4, x[11].5, x[11].6, x[11].7 ]"
"\n movq %%xmm12, %%rcx #rcx = [ x[12].0, x[12].1, x[12].2, x[12].3, x[12].4, x[12].5, x[12].6, x[12].7 ]"
"\n shrq $16, %%rax #rax = [ x[11].2, x[11].3, x[11].4, x[11].5, x[11].6, x[11].7, 0, 0 ]"
"\n shrq $16, %%rcx #rcx = [ x[12].2, x[12].3, x[12].4, x[12].5, x[12].6, x[12].7, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[11].2"
"\n movzbl %%cl, %%edi #edi = x[12].2"
"\n pxor grsoT2(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]^grsoT4[x[6].4]^grsoT3[x[1].3]^grsoT1[x[9].1]^grsoT6[x[10].6]^grsoT2[x[11].2]"
"\n pxor grsoT2(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]^grsoT3[x[2].3]^grsoT1[x[10].1]^grsoT5[x[9].5]^grsoT2[x[12].2]"
"\n movzbl %%ah, %%edx #edx = x[11].3"
"\n movzbl %%ch, %%edi #edi = x[12].3"
"\n pxor grsoT3(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]^grsoT1[x[3].1]^grsoT6[x[4].6]^grsoT0[x[1].0]^grsoT5[x[2].5]^grsoT3[x[11].3]"
"\n pxor grsoT3(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]^grsoT6[x[5].6]^grsoT1[x[4].1]^grsoT5[x[3].5]^grsoT0[x[2].0]^grsoT4[x[1].4]^grsoT3[x[12].3]"
"\n shrq $32, %%rax #rax = [ x[11].6, x[11].7, 0, 0, 0, 0, 0, 0 ]"
"\n shrq $32, %%rcx #rcx = [ x[12].6, x[12].7, 0, 0, 0, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[11].6"
"\n pxor grsoT6(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]^grsoT3[x[2].3]^grsoT1[x[10].1]^grsoT5[x[9].5]^grsoT2[x[12].2]^grsoT6[x[11].6]"
"\n movzbl %%ah, %%edx #edx = x[11].7"
"\n movzbl %%ch, %%edi #edi = x[12].7"
"\n xorl $0xb0, %%edx #xor column constant"
"\n xorl $0xc0, %%edi #xor column constant"
"\n xorl %%ebx, %%edx #xor round counter"
"\n xorl %%ebx, %%edi #xor round counter"
"\n pxor grsoT7(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]^grsoT0[x[6].0]^grsoT4[x[5].4]^grsoT2[x[10].2]^grsoT6[x[9].6]^grsoT7[x[11].7]"
"\n pxor grsoT7(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]^grsoT4[x[6].4]^grsoT3[x[1].3]^grsoT1[x[9].1]^grsoT6[x[10].6]^grsoT2[x[11].2]^grsoT7[x[12].7]"
"\n ### writes contents of MM0..MM7 to memory "
"\n movq %%mm0, 0(%0)"
"\n movq %%mm1, 8(%0)"
"\n movq %%mm2, 16(%0)"
"\n movq %%mm3, 24(%0)"
"\n movq %%mm4, 32(%0)"
"\n movq %%mm5, 40(%0)"
"\n movq %%mm6, 48(%0)"
"\n movq %%mm7, 56(%0)"
"\n ### load a pair of input words x[15], x[0] and process them"
"\n movq %%xmm15, %%rax #rax = [ x[15].0, x[15].1, x[15].2, x[15].3, x[15].4, x[15].5, x[15].6, x[15].7 ]"
"\n movq %%xmm0, %%rcx #rcx = [ x[0].0, x[0].1, x[0].2, x[0].3, x[0].4, x[0].5, x[0].6, x[0].7 ]"
"\n movzbl %%al, %%edx #edx = x[15].0"
"\n movzbl %%cl, %%edi #edi = x[0].0"
"\n movq grsoT0(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[15].0]"
"\n movq grsoT0(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[0].0]"
"\n movzbl %%ah, %%edx #edx = x[15].1"
"\n movzbl %%ch, %%edi #edi = x[0].1"
"\n movq grsoT1(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[15].1]"
"\n movq grsoT1(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[0].1]"
"\n shrq $16, %%rax #rax = [ x[15].2, x[15].3, x[15].4, x[15].5, x[15].6, x[15].7, 0, 0 ]"
"\n shrq $16, %%rcx #rcx = [ x[0].2, x[0].3, x[0].4, x[0].5, x[0].6, x[0].7, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[15].2"
"\n movzbl %%cl, %%edi #edi = x[0].2"
"\n movq grsoT2(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[15].2]"
"\n movq grsoT2(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[0].2]"
"\n shrq $16, %%rax #rax = [ x[15].4, x[15].5, x[15].6, x[15].7, 0, 0, 0, 0 ]"
"\n shrq $16, %%rcx #rcx = [ x[0].4, x[0].5, x[0].6, x[0].7, 0, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[15].4"
"\n pxor grsoT4(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]"
"\n movzbl %%ah, %%edx #edx = x[15].5"
"\n movzbl %%ch, %%edi #edi = x[0].5"
"\n pxor grsoT5(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]"
"\n pxor grsoT5(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]"
"\n shrq $16, %%rax #rax = [ x[15].6, x[15].7, 0, 0, 0, 0, 0, 0 ]"
"\n shrq $16, %%rcx #rcx = [ x[0].6, x[0].7, 0, 0, 0, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[15].6"
"\n movzbl %%cl, %%edi #edi = x[0].6"
"\n pxor grsoT6(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]"
"\n pxor grsoT6(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]"
"\n movzbl %%ah, %%edx #edx = x[15].7"
"\n movzbl %%ch, %%edi #edi = x[0].7"
"\n xorl $0xf0, %%edx #xor column dependent part of const"
"\n xorl $0x00, %%edi #xor column dependent part of const"
"\n xorl %%ebx, %%edx #xor round counter"
"\n xorl %%ebx, %%edi #xor round counter"
"\n movq grsoT7(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[15].7]"
"\n pxor grsoT7(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]"
"\n ### load a pair of input words x[5], x[6] and process them"
"\n movq %%xmm5, %%rax #rax = [ x[5].0, x[5].1, x[5].2, x[5].3, x[5].4, x[5].5, x[5].6, x[5].7 ]"
"\n movq %%xmm6, %%rcx #rcx = [ x[6].0, x[6].1, x[6].2, x[6].3, x[6].4, x[6].5, x[6].6, x[6].7 ]"
"\n shrq $24, %%rax #rax = [ x[5].3, x[5].4, x[5].5, x[5].6, x[5].7, 0, 0, 0 ]"
"\n shrq $24, %%rcx #rcx = [ x[6].3, x[6].4, x[6].5, x[6].6, x[6].7, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[5].3"
"\n movzbl %%cl, %%edi #edi = x[6].3"
"\n pxor grsoT3(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]^grsoT1[x[13].1]^grsoT6[x[14].6]^grsoT0[x[11].0]^grsoT5[x[12].5]^grsoT4[x[10].4]^grsoT3[x[5].3]"
"\n pxor grsoT3(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]^grsoT1[x[14].1]^grsoT5[x[13].5]^grsoT0[x[12].0]^grsoT4[x[11].4]^grsoT7[x[1].7]^grsoT3[x[6].3]"
"\n shrq $32, %%rax #rax = [ x[5].7, 0, 0, 0, 0, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[5].7"
"\n xorl $0x50, %%edx #xor column constant"
"\n xorl %%ebx, %%edx #xor round counter"
"\n pxor grsoT7(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]^grsoT3[x[10].3]^grsoT1[x[2].1]^grsoT5[x[1].5]^grsoT2[x[4].2]^grsoT6[x[3].6]^grsoT7[x[5].7]"
"\n ### load a pair of input words x[13], x[14] and process them"
"\n movq %%xmm13, %%rax #rax = [ x[13].0, x[13].1, x[13].2, x[13].3, x[13].4, x[13].5, x[13].6, x[13].7 ]"
"\n movq %%xmm14, %%rcx #rcx = [ x[14].0, x[14].1, x[14].2, x[14].3, x[14].4, x[14].5, x[14].6, x[14].7 ]"
"\n movzbl %%al, %%edx #edx = x[13].0"
"\n movzbl %%cl, %%edi #edi = x[14].0"
"\n pxor grsoT0(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]^grsoT0[x[13].0]"
"\n pxor grsoT0(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]^grsoT0[x[14].0]"
"\n movzbl %%ah, %%edx #edx = x[13].1"
"\n movzbl %%ch, %%edi #edi = x[14].1"
"\n pxor grsoT1(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]^grsoT1[x[13].1]"
"\n pxor grsoT1(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]^grsoT1[x[14].1]"
"\n shrq $16, %%rax #rax = [ x[13].2, x[13].3, x[13].4, x[13].5, x[13].6, x[13].7, 0, 0 ]"
"\n shrq $16, %%rcx #rcx = [ x[14].2, x[14].3, x[14].4, x[14].5, x[14].6, x[14].7, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[13].2"
"\n movzbl %%cl, %%edi #edi = x[14].2"
"\n movq grsoT2(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[13].2]"
"\n pxor grsoT2(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]"
"\n shrq $16, %%rax #rax = [ x[13].4, x[13].5, x[13].6, x[13].7, 0, 0, 0, 0 ]"
"\n shrq $16, %%rcx #rcx = [ x[14].4, x[14].5, x[14].6, x[14].7, 0, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[13].4"
"\n movzbl %%cl, %%edi #edi = x[14].4"
"\n pxor grsoT4(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]^grsoT0[x[14].0]^grsoT4[x[13].4]"
"\n pxor grsoT4(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]^grsoT4[x[14].4]"
"\n movzbl %%ah, %%edx #edx = x[13].5"
"\n movzbl %%ch, %%edi #edi = x[14].5"
"\n pxor grsoT5(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]^grsoT1[x[14].1]^grsoT5[x[13].5]"
"\n pxor grsoT5(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]^grsoT0[x[13].0]^grsoT5[x[14].5]"
"\n shrq $16, %%rax #rax = [ x[13].6, x[13].7, 0, 0, 0, 0, 0, 0 ]"
"\n shrq $16, %%rcx #rcx = [ x[14].6, x[14].7, 0, 0, 0, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[13].6"
"\n movzbl %%cl, %%edi #edi = x[14].6"
"\n pxor grsoT6(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]^grsoT6[x[13].6]"
"\n pxor grsoT6(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]^grsoT1[x[13].1]^grsoT6[x[14].6]"
"\n movzbl %%ch, %%edi #edi = x[14].7"
"\n xorl $0xe0, %%edi #xor column dependent part of const"
"\n xorl %%ebx, %%edi #xor round conter"
"\n pxor grsoT7(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]"
"\n ### load a pair of input words x[7], x[8] and process them"
"\n movq %%xmm7, %%rax #rax = [ x[7].0, x[7].1, x[7].2, x[7].3, x[7].4, x[7].5, x[7].6, x[7].7 ]"
"\n movq %%xmm8, %%rcx #rcx = [ x[8].0, x[8].1, x[8].2, x[8].3, x[8].4, x[8].5, x[8].6, x[8].7 ]"
"\n shrq $24, %%rax #rax = [ x[7].3, x[7].4, x[7].5, x[7].6, x[7].7, 0, 0, 0 ]"
"\n shrq $24, %%rcx #rcx = [ x[8].3, x[8].4, x[8].5, x[8].6, x[8].7, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[7].3"
"\n movzbl %%cl, %%edi #edi = x[8].3"
"\n pxor grsoT3(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]^grsoT0[x[13].0]^grsoT5[x[14].5]^grsoT4[x[12].4]^grsoT2[x[1].2]^grsoT7[x[2].7]^grsoT3[x[7].3]"
"\n pxor grsoT3(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]^grsoT0[x[14].0]^grsoT4[x[13].4]^grsoT2[x[2].2]^grsoT6[x[1].6]^grsoT7[x[3].7]^grsoT3[x[8].3]"
"\n movzbl %%ch, %%edi #edi = x[8].4"
"\n pxor grsoT4(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]^grsoT1[x[11].1]^grsoT6[x[12].6]^grsoT0[x[9].0]^grsoT5[x[10].5]^grsoT3[x[3].3]^grsoT4[x[8].4]"
"\n ### load a pair of input words x[11], x[12] and process them"
"\n movq %%xmm11, %%rax #rax = [ x[11].0, x[11].1, x[11].2, x[11].3, x[11].4, x[11].5, x[11].6, x[11].7 ]"
"\n movq %%xmm12, %%rcx #rcx = [ x[12].0, x[12].1, x[12].2, x[12].3, x[12].4, x[12].5, x[12].6, x[12].7 ]"
"\n movzbl %%al, %%edx #edx = x[11].0"
"\n movzbl %%cl, %%edi #edi = x[12].0"
"\n pxor grsoT0(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]^grsoT1[x[13].1]^grsoT6[x[14].6]^grsoT0[x[11].0]"
"\n pxor grsoT0(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]^grsoT1[x[14].1]^grsoT5[x[13].5]^grsoT0[x[12].0]"
"\n movzbl %%ah, %%edx #edx = x[11].1"
"\n movzbl %%ch, %%edi #edi = x[12].1"
"\n pxor grsoT1(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]^grsoT1[x[11].1]"
"\n pxor grsoT1(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]^grsoT6[x[13].6]^grsoT1[x[12].1]"
"\n shrq $32, %%rax #rax = [ x[11].4, x[11].5, x[11].6, x[11].7, 0, 0, 0, 0 ]"
"\n shrq $32, %%rcx #rcx = [ x[12].4, x[12].5, x[12].6, x[12].7, 0, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[11].4"
"\n movzbl %%cl, %%edi #edi = x[12].4"
"\n pxor grsoT4(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]^grsoT1[x[14].1]^grsoT5[x[13].5]^grsoT0[x[12].0]^grsoT4[x[11].4]"
"\n pxor grsoT4(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]^grsoT0[x[13].0]^grsoT5[x[14].5]^grsoT4[x[12].4]"
"\n movzbl %%ah, %%edx #edx = x[11].5"
"\n movzbl %%ch, %%edi #edi = x[12].5"
"\n pxor grsoT5(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]^grsoT6[x[13].6]^grsoT1[x[12].1]^grsoT5[x[11].5]"
"\n pxor grsoT5(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]^grsoT1[x[13].1]^grsoT6[x[14].6]^grsoT0[x[11].0]^grsoT5[x[12].5]"
"\n shrq $16, %%rcx #rcx = [ x[12].6, x[12].7, 0, 0, 0, 0, 0, 0 ]"
"\n movzbl %%cl, %%edi #edi = x[12].6"
"\n pxor grsoT6(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]^grsoT1[x[11].1]^grsoT6[x[12].6]"
"\n ### load a pair of input words x[9], x[10] and process them"
"\n movq %%xmm9, %%rax #rax = [ x[9].0, x[9].1, x[9].2, x[9].3, x[9].4, x[9].5, x[9].6, x[9].7 ]"
"\n movq %%xmm10, %%rcx #rcx = [ x[10].0, x[10].1, x[10].2, x[10].3, x[10].4, x[10].5, x[10].6, x[10].7 ]"
"\n movzbl %%al, %%edx #edx = x[9].0"
"\n movzbl %%cl, %%edi #edi = x[10].0"
"\n pxor grsoT0(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]^grsoT1[x[11].1]^grsoT6[x[12].6]^grsoT0[x[9].0]"
"\n pxor grsoT0(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]^grsoT6[x[13].6]^grsoT1[x[12].1]^grsoT5[x[11].5]^grsoT0[x[10].0]"
"\n shrq $24, %%rax #rax = [ x[9].3, x[9].4, x[9].5, x[9].6, x[9].7, 0, 0, 0 ]"
"\n shrq $24, %%rcx #rcx = [ x[10].3, x[10].4, x[10].5, x[10].6, x[10].7, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[9].3"
"\n movzbl %%cl, %%edi #edi = x[10].3"
"\n pxor grsoT3(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]^grsoT4[x[14].4]^grsoT3[x[9].3]"
"\n pxor grsoT3(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]^grsoT3[x[10].3]"
"\n movzbl %%ah, %%edx #edx = x[9].4"
"\n movzbl %%ch, %%edi #edi = x[10].4"
"\n pxor grsoT4(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]^grsoT6[x[13].6]^grsoT1[x[12].1]^grsoT5[x[11].5]^grsoT0[x[10].0]^grsoT4[x[9].4]"
"\n pxor grsoT4(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]^grsoT1[x[13].1]^grsoT6[x[14].6]^grsoT0[x[11].0]^grsoT5[x[12].5]^grsoT4[x[10].4]"
"\n shrq $16, %%rcx #rcx = [ x[10].5, x[10].6, x[10].7, 0, 0, 0, 0, 0 ]"
"\n movzbl %%cl, %%edi #edi = x[10].5"
"\n pxor grsoT5(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]^grsoT1[x[11].1]^grsoT6[x[12].6]^grsoT0[x[9].0]^grsoT5[x[10].5]"
"\n ### load a pair of input words x[1], x[2] and process them"
"\n movq %%xmm1, %%rax #rax = [ x[1].0, x[1].1, x[1].2, x[1].3, x[1].4, x[1].5, x[1].6, x[1].7 ]"
"\n movq %%xmm2, %%rcx #rcx = [ x[2].0, x[2].1, x[2].2, x[2].3, x[2].4, x[2].5, x[2].6, x[2].7 ]"
"\n movzbl %%ah, %%edx #edx = x[1].1"
"\n movzbl %%ch, %%edi #edi = x[2].1"
"\n pxor grsoT1(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]^grsoT4[x[14].4]^grsoT3[x[9].3]^grsoT1[x[1].1]"
"\n pxor grsoT1(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]^grsoT3[x[10].3]^grsoT1[x[2].1]"
"\n shrq $16, %%rax #rax = [ x[1].2, x[1].3, x[1].4, x[1].5, x[1].6, x[1].7, 0, 0 ]"
"\n shrq $16, %%rcx #rcx = [ x[2].2, x[2].3, x[2].4, x[2].5, x[2].6, x[2].7, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[1].2"
"\n movzbl %%cl, %%edi #edi = x[2].2"
"\n pxor grsoT2(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]^grsoT0[x[13].0]^grsoT5[x[14].5]^grsoT4[x[12].4]^grsoT2[x[1].2]"
"\n pxor grsoT2(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]^grsoT0[x[14].0]^grsoT4[x[13].4]^grsoT2[x[2].2]"
"\n shrq $24, %%rax #rax = [ x[1].5, x[1].6, x[1].7, 0, 0, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[1].5"
"\n pxor grsoT5(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]^grsoT3[x[10].3]^grsoT1[x[2].1]^grsoT5[x[1].5]"
"\n shrq $8, %%rax #rax = [ x[1].6, x[1].7, 0, 0, 0, 0, 0, 0 ]"
"\n shrq $32, %%rcx #rcx = [ x[2].6, x[2].7, 0, 0, 0, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[1].6"
"\n movzbl %%cl, %%edi #edi = x[2].6"
"\n pxor grsoT6(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]^grsoT0[x[14].0]^grsoT4[x[13].4]^grsoT2[x[2].2]^grsoT6[x[1].6]"
"\n pxor grsoT6(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]^grsoT4[x[14].4]^grsoT3[x[9].3]^grsoT1[x[1].1]^grsoT6[x[2].6]"
"\n movzbl %%ah, %%edx #edx = x[1].7"
"\n movzbl %%ch, %%edi #edi = x[2].7"
"\n xorl $0x10, %%edx #xor column constant"
"\n xorl $0x20, %%edi #xor column constant"
"\n xorl %%ebx, %%edx #xor round counter"
"\n xorl %%ebx, %%edi #xor round counter"
"\n pxor grsoT7(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]^grsoT1[x[14].1]^grsoT5[x[13].5]^grsoT0[x[12].0]^grsoT4[x[11].4]^grsoT7[x[1].7]"
"\n pxor grsoT7(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]^grsoT0[x[13].0]^grsoT5[x[14].5]^grsoT4[x[12].4]^grsoT2[x[1].2]^grsoT7[x[2].7]"
"\n ### load a pair of input words x[3], x[4] and process them"
"\n movq %%xmm3, %%rax #rax = [ x[3].0, x[3].1, x[3].2, x[3].3, x[3].4, x[3].5, x[3].6, x[3].7 ]"
"\n movq %%xmm4, %%rcx #rcx = [ x[4].0, x[4].1, x[4].2, x[4].3, x[4].4, x[4].5, x[4].6, x[4].7 ]"
"\n shrq $16, %%rax #rax = [ x[3].2, x[3].3, x[3].4, x[3].5, x[3].6, x[3].7, 0, 0 ]"
"\n shrq $16, %%rcx #rcx = [ x[4].2, x[4].3, x[4].4, x[4].5, x[4].6, x[4].7, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[3].2"
"\n movzbl %%cl, %%edi #edi = x[4].2"
"\n pxor grsoT2(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]^grsoT4[x[14].4]^grsoT3[x[9].3]^grsoT1[x[1].1]^grsoT6[x[2].6]^grsoT2[x[3].2]"
"\n pxor grsoT2(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]^grsoT3[x[10].3]^grsoT1[x[2].1]^grsoT5[x[1].5]^grsoT2[x[4].2]"
"\n movzbl %%ah, %%edx #edx = x[3].3"
"\n movzbl %%ch, %%edi #edi = x[4].3"
"\n pxor grsoT3(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]^grsoT1[x[11].1]^grsoT6[x[12].6]^grsoT0[x[9].0]^grsoT5[x[10].5]^grsoT3[x[3].3]"
"\n pxor grsoT3(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]^grsoT6[x[13].6]^grsoT1[x[12].1]^grsoT5[x[11].5]^grsoT0[x[10].0]^grsoT4[x[9].4]^grsoT3[x[4].3]"
"\n shrq $32, %%rax #rax = [ x[3].6, x[3].7, 0, 0, 0, 0, 0, 0 ]"
"\n shrq $32, %%rcx #rcx = [ x[4].6, x[4].7, 0, 0, 0, 0, 0, 0 ]"
"\n movzbl %%al, %%edx #edx = x[3].6"
"\n pxor grsoT6(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]^grsoT3[x[10].3]^grsoT1[x[2].1]^grsoT5[x[1].5]^grsoT2[x[4].2]^grsoT6[x[3].6]"
"\n movzbl %%ah, %%edx #edx = x[3].7"
"\n movzbl %%ch, %%edi #edi = x[4].7"
"\n xorl $0x30, %%edx #xor column constant"
"\n xorl $0x40, %%edi #xor column constant"
"\n xorl %%ebx, %%edx #xor round counter"
"\n xorl %%ebx, %%edi #xor round counter"
"\n pxor grsoT7(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]^grsoT0[x[14].0]^grsoT4[x[13].4]^grsoT2[x[2].2]^grsoT6[x[1].6]^grsoT7[x[3].7]"
"\n pxor grsoT7(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]^grsoT4[x[14].4]^grsoT3[x[9].3]^grsoT1[x[1].1]^grsoT6[x[2].6]^grsoT2[x[3].2]^grsoT7[x[4].7]"
"\n incl %%ebx"
"\n cmp $14, %%ebx"
"\n je 2f"
"\n ### move 8 MMX registers to low halves of XMM registers"
"\n movq2dq %%mm0, %%xmm8"
"\n movq2dq %%mm1, %%xmm9"
"\n movq2dq %%mm2, %%xmm10"
"\n movq2dq %%mm3, %%xmm11"
"\n movq2dq %%mm4, %%xmm12"
"\n movq2dq %%mm5, %%xmm13"
"\n movq2dq %%mm6, %%xmm14"
"\n movq2dq %%mm7, %%xmm15"
"\n ### read back 8 words of input state from memory to 8 low halves of XMM registers xmm0...xmm15"
"\n movaps 0(%0), %%xmm0"
"\n movhlps %%xmm0, %%xmm1"
"\n movaps 16(%0), %%xmm2"
"\n movhlps %%xmm2, %%xmm3"
"\n movaps 32(%0), %%xmm4"
"\n movhlps %%xmm4, %%xmm5"
"\n movaps 48(%0), %%xmm6"
"\n movhlps %%xmm6, %%xmm7"
"\n jmp 1b"
"\n 2: # finalization"
"\n ### writes contents of MM0..MM7 to memory "
"\n movq %%mm0, 64(%0)"
"\n movq %%mm1, 72(%0)"
"\n movq %%mm2, 80(%0)"
"\n movq %%mm3, 88(%0)"
"\n movq %%mm4, 96(%0)"
"\n movq %%mm5, 104(%0)"
"\n movq %%mm6, 112(%0)"
"\n movq %%mm7, 120(%0)"
: /*no output, only memory is modified */
: "r"(x)
: "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "memory", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" , "%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" , "%xmm8" , "%xmm9" , "%xmm10" , "%xmm11" , "%xmm12" , "%xmm13" , "%xmm14" , "%xmm15" );
}//Q1024ASM()