Files
cpuminer-opt-gpu/algo/groestl/sse2/grso-asm.c
2016-09-22 13:16:18 -04:00

1064 lines
29 KiB
C

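/* MMX-optimized x86-64 inline assembly (GNU C extended asm, AT&T syntax);
   uses mm0-mm7 together with the 64-bit general-purpose registers. */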
#include "grso-asm.h"
void grsoP1024ASM (u64 *x) {
asm (
"\n movq 8(%0), %%rcx"
"\n movq 24(%0), %%rdx"
"\n movq $0, 8(%0)"
"\n 1:"
"\n movq 0(%0), %%rax"
"\n movq 16(%0), %%rbx"
"\n xorq $0x10, %%rcx"
"\n xorq $0x30, %%rdx"
"\n xorq 8(%0), %%rcx"
"\n xorq 8(%0), %%rdx"
"\n xorq $0x20, %%rbx"
"\n xorq 8(%0), %%rax"
"\n xorq 8(%0), %%rbx"
"\n # processing input words x[1]=rcx and x[3]=rdx "
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n movq grsoT0(,%%rdi,8), %%mm1"
"\n movq grsoT1(,%%rsi,8), %%mm0"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n movq grsoT0(,%%rsi,8), %%mm3"
"\n movq grsoT1(,%%rdi,8), %%mm2"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n movq grsoT2(,%%rdi,8), %%r15"
"\n movq grsoT3(,%%rsi,8), %%r14"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n pxor grsoT2(,%%rsi,8), %%mm1"
"\n pxor grsoT3(,%%rdi,8), %%mm0"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n movq grsoT4(,%%rdi,8), %%r13"
"\n movq grsoT5(,%%rsi,8), %%r12"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n xorq grsoT4(,%%rsi,8), %%r15"
"\n xorq grsoT5(,%%rdi,8), %%r14"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n movq grsoT6(,%%rdi,8), %%r11"
"\n movq grsoT7(,%%rsi,8), %%mm6"
"\n movzbl %%dl, %%edi"
"\n movzbl %%dh, %%esi"
"\n xorq grsoT6(,%%rdi,8), %%r13"
"\n movq grsoT7(,%%rsi,8), %%r8"
"\n movq 40(%0), %%rcx"
"\n movq 56(%0), %%rdx"
"\n xorq $0x50, %%rcx"
"\n xorq $0x70, %%rdx"
"\n xorq 8(%0), %%rcx"
"\n xorq 8(%0), %%rdx"
"\n # processing input words x[0]=rax and x[2]=rbx "
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n pxor grsoT0(,%%rdi,8), %%mm0"
"\n xorq grsoT1(,%%rsi,8), %%r15"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n pxor grsoT0(,%%rsi,8), %%mm2"
"\n pxor grsoT1(,%%rdi,8), %%mm1"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n xorq grsoT2(,%%rdi,8), %%r14"
"\n xorq grsoT3(,%%rsi,8), %%r13"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n pxor grsoT2(,%%rsi,8), %%mm0"
"\n xorq grsoT3(,%%rdi,8), %%r15"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n xorq grsoT4(,%%rdi,8), %%r12"
"\n xorq grsoT5(,%%rsi,8), %%r11"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n xorq grsoT4(,%%rsi,8), %%r14"
"\n xorq grsoT5(,%%rdi,8), %%r13"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n movq grsoT6(,%%rdi,8), %%r10"
"\n movq grsoT7(,%%rsi,8), %%mm5"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n xorq grsoT6(,%%rsi,8), %%r12"
"\n movq grsoT7(,%%rdi,8), %%mm7"
"\n movq 32(%0), %%rax"
"\n movq 48(%0), %%rbx"
"\n xorq $0x40, %%rax"
"\n xorq $0x60, %%rbx"
"\n xorq 8(%0), %%rax"
"\n xorq 8(%0), %%rbx"
"\n # processing input words x[5]=rcx and x[7]=rdx "
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n pxor grsoT0(,%%rdi,8), %%mm5"
"\n movq grsoT1(,%%rsi,8), %%mm4"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n pxor grsoT0(,%%rsi,8), %%mm7"
"\n pxor grsoT1(,%%rdi,8), %%mm6"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n pxor grsoT2(,%%rdi,8), %%mm3"
"\n pxor grsoT3(,%%rsi,8), %%mm2"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n pxor grsoT2(,%%rsi,8), %%mm5"
"\n pxor grsoT3(,%%rdi,8), %%mm4"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n pxor grsoT4(,%%rdi,8), %%mm1"
"\n pxor grsoT5(,%%rsi,8), %%mm0"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n pxor grsoT4(,%%rsi,8), %%mm3"
"\n pxor grsoT5(,%%rdi,8), %%mm2"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n xorq grsoT6(,%%rdi,8), %%r15"
"\n xorq grsoT7(,%%rsi,8), %%r10"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n pxor grsoT6(,%%rsi,8), %%mm1"
"\n xorq grsoT7(,%%rdi,8), %%r12"
"\n movq 72(%0), %%rcx"
"\n movq 88(%0), %%rdx"
"\n xorq $0x90, %%rcx"
"\n xorq $0xb0, %%rdx"
"\n xorq 8(%0), %%rcx"
"\n xorq 8(%0), %%rdx"
"\n # processing input words x[4]=rax and x[6]=rbx "
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n pxor grsoT0(,%%rdi,8), %%mm4"
"\n pxor grsoT1(,%%rsi,8), %%mm3"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n pxor grsoT0(,%%rsi,8), %%mm6"
"\n pxor grsoT1(,%%rdi,8), %%mm5"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n pxor grsoT2(,%%rdi,8), %%mm2"
"\n pxor grsoT3(,%%rsi,8), %%mm1"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n pxor grsoT2(,%%rsi,8), %%mm4"
"\n pxor grsoT3(,%%rdi,8), %%mm3"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n pxor grsoT4(,%%rdi,8), %%mm0"
"\n xorq grsoT5(,%%rsi,8), %%r15"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n pxor grsoT4(,%%rsi,8), %%mm2"
"\n pxor grsoT5(,%%rdi,8), %%mm1"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n xorq grsoT6(,%%rdi,8), %%r14"
"\n movq grsoT7(,%%rsi,8), %%r9"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n pxor grsoT6(,%%rsi,8), %%mm0"
"\n xorq grsoT7(,%%rdi,8), %%r11"
"\n movq 64(%0), %%rax"
"\n movq 80(%0), %%rbx"
"\n xorq $0x80, %%rax"
"\n xorq $0xa0, %%rbx"
"\n xorq 8(%0), %%rax"
"\n xorq 8(%0), %%rbx"
"\n # processing input words x[9]=rcx and x[11]=rdx "
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n xorq grsoT0(,%%rdi,8), %%r9"
"\n xorq grsoT1(,%%rsi,8), %%r8"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n xorq grsoT0(,%%rsi,8), %%r11"
"\n xorq grsoT1(,%%rdi,8), %%r10"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n pxor grsoT2(,%%rdi,8), %%mm7"
"\n pxor grsoT3(,%%rsi,8), %%mm6"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n xorq grsoT2(,%%rsi,8), %%r9"
"\n xorq grsoT3(,%%rdi,8), %%r8"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n pxor grsoT4(,%%rdi,8), %%mm5"
"\n pxor grsoT5(,%%rsi,8), %%mm4"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n pxor grsoT4(,%%rsi,8), %%mm7"
"\n pxor grsoT5(,%%rdi,8), %%mm6"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n pxor grsoT6(,%%rdi,8), %%mm3"
"\n xorq grsoT7(,%%rsi,8), %%r14"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n pxor grsoT6(,%%rsi,8), %%mm5"
"\n pxor grsoT7(,%%rdi,8), %%mm0"
"\n movq 104(%0), %%rcx"
"\n movq 120(%0), %%rdx"
"\n xorq $0xd0, %%rcx"
"\n xorq $0xf0, %%rdx"
"\n xorq 8(%0), %%rcx"
"\n xorq 8(%0), %%rdx"
"\n # processing input words x[8]=rax and x[10]=rbx "
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n xorq grsoT0(,%%rdi,8), %%r8"
"\n pxor grsoT1(,%%rsi,8), %%mm7"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n xorq grsoT0(,%%rsi,8), %%r10"
"\n xorq grsoT1(,%%rdi,8), %%r9"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n pxor grsoT2(,%%rdi,8), %%mm6"
"\n pxor grsoT3(,%%rsi,8), %%mm5"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n xorq grsoT2(,%%rsi,8), %%r8"
"\n pxor grsoT3(,%%rdi,8), %%mm7"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n pxor grsoT4(,%%rdi,8), %%mm4"
"\n pxor grsoT5(,%%rsi,8), %%mm3"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n pxor grsoT4(,%%rsi,8), %%mm6"
"\n pxor grsoT5(,%%rdi,8), %%mm5"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n pxor grsoT6(,%%rdi,8), %%mm2"
"\n xorq grsoT7(,%%rsi,8), %%r13"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n pxor grsoT6(,%%rsi,8), %%mm4"
"\n xorq grsoT7(,%%rdi,8), %%r15"
"\n movq 96(%0), %%rax"
"\n movq 112(%0), %%rbx"
"\n xorq $0xc0, %%rax"
"\n xorq $0xe0, %%rbx"
"\n xorq 8(%0), %%rax"
"\n xorq 8(%0), %%rbx"
"\n # processing input words x[13]=rcx and x[15]=rdx "
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n xorq grsoT0(,%%rdi,8), %%r13"
"\n xorq grsoT1(,%%rsi,8), %%r12"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n xorq grsoT0(,%%rsi,8), %%r15"
"\n xorq grsoT1(,%%rdi,8), %%r14"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n xorq grsoT2(,%%rdi,8), %%r11"
"\n xorq grsoT3(,%%rsi,8), %%r10"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n xorq grsoT2(,%%rsi,8), %%r13"
"\n xorq grsoT3(,%%rdi,8), %%r12"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n xorq grsoT4(,%%rdi,8), %%r9"
"\n xorq grsoT5(,%%rsi,8), %%r8"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n xorq grsoT4(,%%rsi,8), %%r11"
"\n xorq grsoT5(,%%rdi,8), %%r10"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n pxor grsoT6(,%%rdi,8), %%mm7"
"\n pxor grsoT7(,%%rsi,8), %%mm2"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n xorq grsoT6(,%%rsi,8), %%r9"
"\n pxor grsoT7(,%%rdi,8), %%mm4"
"\n # processing input words x[12]=rax and x[14]=rbx "
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n xorq grsoT0(,%%rdi,8), %%r12"
"\n xorq grsoT1(,%%rsi,8), %%r11"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n xorq grsoT0(,%%rsi,8), %%r14"
"\n xorq grsoT1(,%%rdi,8), %%r13"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n xorq grsoT2(,%%rdi,8), %%r10"
"\n xorq grsoT3(,%%rsi,8), %%r9"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n xorq grsoT2(,%%rsi,8), %%r12"
"\n xorq grsoT3(,%%rdi,8), %%r11"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n xorq grsoT4(,%%rdi,8), %%r8"
"\n pxor grsoT5(,%%rsi,8), %%mm7"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n xorq grsoT4(,%%rsi,8), %%r10"
"\n xorq grsoT5(,%%rdi,8), %%r9"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n pxor grsoT6(,%%rdi,8), %%mm6"
"\n pxor grsoT7(,%%rsi,8), %%mm1"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n xorq grsoT6(,%%rsi,8), %%r8"
"\n pxor grsoT7(,%%rdi,8), %%mm3"
"\n incq 8(%0) #increment counter"
"\n movq 8(%0), %%rdi"
"\n cmp $14, %%edi"
"\n je 2f"
"\n movq %%mm1, %%rcx"
"\n movq %%mm3, %%rdx"
"\n movq %%mm0, 0(%0)"
"\n movq %%mm2, 16(%0)"
"\n movq %%mm4, 32(%0)"
"\n movq %%mm5, 40(%0)"
"\n movq %%mm6, 48(%0)"
"\n movq %%mm7, 56(%0)"
"\n movq %%r8 , 64(%0)"
"\n movq %%r9 , 72(%0)"
"\n movq %%r10, 80(%0)"
"\n movq %%r11, 88(%0)"
"\n movq %%r12, 96(%0)"
"\n movq %%r13, 104(%0)"
"\n movq %%r14, 112(%0)"
"\n movq %%r15, 120(%0)"
"\n jmp 1b"
"\n 2:"
"\n movq %%mm0, 0(%0)"
"\n movq %%mm1, 8(%0)"
"\n movq %%mm2, 16(%0)"
"\n movq %%mm3, 24(%0)"
"\n movq %%mm4, 32(%0)"
"\n movq %%mm5, 40(%0)"
"\n movq %%mm6, 48(%0)"
"\n movq %%mm7, 56(%0)"
"\n movq %%r8 , 64(%0)"
"\n movq %%r9 , 72(%0)"
"\n movq %%r10, 80(%0)"
"\n movq %%r11, 88(%0)"
"\n movq %%r12, 96(%0)"
"\n movq %%r13, 104(%0)"
"\n movq %%r14, 112(%0)"
"\n movq %%r15, 120(%0)"
: /*no output, only memory is modified */
: "r"(x)
: "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%rsi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory" , "%mm0", "%mm1", "%mm2" , "%mm3" , "%mm4" , "%mm5" , "%mm6" , "%mm7" );
}//P512ASM()
void grsoQ1024ASM (u64 *x) {
asm (
"\n movq 8(%0), %%rcx"
"\n movq 24(%0), %%rdx"
"\n movq $0, 8(%0)"
"\n 1:"
"\n movq 0(%0), %%rax"
"\n movq 16(%0), %%rbx"
/* add round constants to columns 0-3 */
"\n movq $0xffffffffffffffff, %%r14"
"\n movq $0xefffffffffffffff, %%r15"
"\n xorq %%r14, %%rax"
"\n xorq %%r15, %%rcx"
"\n movq $0xdfffffffffffffff, %%r14"
"\n movq $0xcfffffffffffffff, %%r15"
"\n xorq %%r14, %%rbx"
"\n xorq %%r15, %%rdx"
"\n # processing input words x[1]=rcx and x[3]=rdx "
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n movq grsoT0(,%%rdi,8), %%mm0"
"\n movq grsoT1(,%%rsi,8), %%r14"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n movq grsoT0(,%%rsi,8), %%mm2"
"\n pxor grsoT1(,%%rdi,8), %%mm0"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n movq grsoT2(,%%rdi,8), %%r12"
"\n movq grsoT3(,%%rsi,8), %%mm6"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n xorq grsoT2(,%%rsi,8), %%r14"
"\n movq grsoT3(,%%rdi,8), %%r8"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n movq grsoT4(,%%rdi,8), %%mm1"
"\n movq grsoT5(,%%rsi,8), %%r15"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n movq grsoT4(,%%rsi,8), %%mm3"
"\n pxor grsoT5(,%%rdi,8), %%mm1"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n xorq 8(%0), %%rsi"
"\n movq grsoT6(,%%rdi,8), %%r13"
"\n movq grsoT7(,%%rsi,8), %%r11"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n xorq 8(%0), %%rdi"
"\n xorq grsoT6(,%%rsi,8), %%r15"
"\n xorq grsoT7(,%%rdi,8), %%r13"
"\n # processing input words x[0]=rax and x[2]=rbx "
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n xorq grsoT0(,%%rdi,8), %%r15"
"\n xorq grsoT1(,%%rsi,8), %%r13"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n pxor grsoT0(,%%rsi,8), %%mm1"
"\n xorq grsoT1(,%%rdi,8), %%r15"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n xorq grsoT2(,%%rdi,8), %%r11"
"\n movq grsoT3(,%%rsi,8), %%mm5"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n xorq grsoT2(,%%rsi,8), %%r13"
"\n movq grsoT3(,%%rdi,8), %%mm7"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n pxor grsoT4(,%%rdi,8), %%mm0"
"\n xorq grsoT5(,%%rsi,8), %%r14"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n pxor grsoT4(,%%rsi,8), %%mm2"
"\n pxor grsoT5(,%%rdi,8), %%mm0"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n xorq 8(%0), %%rsi"
"\n xorq grsoT6(,%%rdi,8), %%r12"
"\n movq grsoT7(,%%rsi,8), %%r10"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n xorq 8(%0), %%rdi"
"\n xorq grsoT6(,%%rsi,8), %%r14"
"\n xorq grsoT7(,%%rdi,8), %%r12"
/* read columns 4-7 from registers and add round constants to these */
"\n movq %%r14, 128(%0)"
"\n movq %%r15, 136(%0)"
"\n movq 32(%0), %%rax" /* read input column 4 */
"\n movq 40(%0), %%rcx" /* read input column 5 */
"\n movq 48(%0), %%rbx" /* read input column 6 */
"\n movq 56(%0), %%rdx" /* read input column 7 */
"\n movq $0xbfffffffffffffff, %%r14"
"\n movq $0xafffffffffffffff, %%r15"
"\n xorq %%r14, %%rax"
"\n xorq %%r15, %%rcx"
"\n movq $0x9fffffffffffffff, %%r14"
"\n movq $0x8fffffffffffffff, %%r15"
"\n xorq %%r14, %%rbx"
"\n xorq %%r15, %%rdx"
"\n movq 128(%0), %%r14"
"\n movq 136(%0), %%r15"
"\n # processing input words x[5]=rcx and x[7]=rdx "
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n movq grsoT0(,%%rdi,8), %%mm4"
"\n pxor grsoT1(,%%rsi,8), %%mm2"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n pxor grsoT0(,%%rsi,8), %%mm6"
"\n pxor grsoT1(,%%rdi,8), %%mm4"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n pxor grsoT2(,%%rdi,8), %%mm0"
"\n xorq grsoT3(,%%rsi,8), %%r10"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n pxor grsoT2(,%%rsi,8), %%mm2"
"\n xorq grsoT3(,%%rdi,8), %%r12"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n pxor grsoT4(,%%rdi,8), %%mm5"
"\n pxor grsoT5(,%%rsi,8), %%mm3"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n pxor grsoT4(,%%rsi,8), %%mm7"
"\n pxor grsoT5(,%%rdi,8), %%mm5"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n xorq 8(%0), %%rsi"
"\n pxor grsoT6(,%%rdi,8), %%mm1"
"\n xorq grsoT7(,%%rsi,8), %%r15"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n xorq 8(%0), %%rdi"
"\n pxor grsoT6(,%%rsi,8), %%mm3"
"\n pxor grsoT7(,%%rdi,8), %%mm1"
"\n # processing input words x[4]=rax and x[6]=rbx "
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n pxor grsoT0(,%%rdi,8), %%mm3"
"\n pxor grsoT1(,%%rsi,8), %%mm1"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n pxor grsoT0(,%%rsi,8), %%mm5"
"\n pxor grsoT1(,%%rdi,8), %%mm3"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n xorq grsoT2(,%%rdi,8), %%r15"
"\n movq grsoT3(,%%rsi,8), %%r9"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n pxor grsoT2(,%%rsi,8), %%mm1"
"\n xorq grsoT3(,%%rdi,8), %%r11"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n pxor grsoT4(,%%rdi,8), %%mm4"
"\n pxor grsoT5(,%%rsi,8), %%mm2"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n pxor grsoT4(,%%rsi,8), %%mm6"
"\n pxor grsoT5(,%%rdi,8), %%mm4"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n xorq 8(%0), %%rsi"
"\n pxor grsoT6(,%%rdi,8), %%mm0"
"\n xorq grsoT7(,%%rsi,8), %%r14"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n xorq 8(%0), %%rdi"
"\n pxor grsoT6(,%%rsi,8), %%mm2"
"\n pxor grsoT7(,%%rdi,8), %%mm0"
/* read columns 8-11 from registers and add round constants to these */
"\n movq %%r14, 128(%0)"
"\n movq %%r15, 136(%0)"
"\n movq 64(%0), %%rax" /* read input column 8 */
"\n movq 72(%0), %%rcx" /* read input column 9 */
"\n movq 80(%0), %%rbx" /* read input column 10 */
"\n movq 88(%0), %%rdx" /* read input column 11 */
"\n movq $0x7fffffffffffffff, %%r14"
"\n movq $0x6fffffffffffffff, %%r15"
"\n xorq %%r14, %%rax"
"\n xorq %%r15, %%rcx"
"\n movq $0x5fffffffffffffff, %%r14"
"\n movq $0x4fffffffffffffff, %%r15"
"\n xorq %%r14, %%rbx"
"\n xorq %%r15, %%rdx"
"\n movq 128(%0), %%r14"
"\n movq 136(%0), %%r15"
"\n # processing input words x[9]=rcx and x[11]=rdx "
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n xorq grsoT0(,%%rdi,8), %%r8"
"\n pxor grsoT1(,%%rsi,8), %%mm6"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n xorq grsoT0(,%%rsi,8), %%r10"
"\n xorq grsoT1(,%%rdi,8), %%r8"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n pxor grsoT2(,%%rdi,8), %%mm4"
"\n xorq grsoT3(,%%rsi,8), %%r14"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n pxor grsoT2(,%%rsi,8), %%mm6"
"\n pxor grsoT3(,%%rdi,8), %%mm0"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n xorq grsoT4(,%%rdi,8), %%r9"
"\n pxor grsoT5(,%%rsi,8), %%mm7"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n xorq grsoT4(,%%rsi,8), %%r11"
"\n xorq grsoT5(,%%rdi,8), %%r9"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n xorq 8(%0), %%rsi"
"\n pxor grsoT6(,%%rdi,8), %%mm5"
"\n pxor grsoT7(,%%rsi,8), %%mm3"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n xorq 8(%0), %%rdi"
"\n pxor grsoT6(,%%rsi,8), %%mm7"
"\n pxor grsoT7(,%%rdi,8), %%mm5"
"\n # processing input words x[8]=rax and x[10]=rbx "
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n pxor grsoT0(,%%rdi,8), %%mm7"
"\n pxor grsoT1(,%%rsi,8), %%mm5"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n xorq grsoT0(,%%rsi,8), %%r9"
"\n pxor grsoT1(,%%rdi,8), %%mm7"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n pxor grsoT2(,%%rdi,8), %%mm3"
"\n xorq grsoT3(,%%rsi,8), %%r13"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n pxor grsoT2(,%%rsi,8), %%mm5"
"\n xorq grsoT3(,%%rdi,8), %%r15"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n xorq grsoT4(,%%rdi,8), %%r8"
"\n pxor grsoT5(,%%rsi,8), %%mm6"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n xorq grsoT4(,%%rsi,8), %%r10"
"\n xorq grsoT5(,%%rdi,8), %%r8"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n xorq 8(%0), %%rsi"
"\n pxor grsoT6(,%%rdi,8), %%mm4"
"\n pxor grsoT7(,%%rsi,8), %%mm2"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n xorq 8(%0), %%rdi"
"\n pxor grsoT6(,%%rsi,8), %%mm6"
"\n pxor grsoT7(,%%rdi,8), %%mm4"
/* read columns 12-15 from registers and add round constants to these */
"\n movq %%r14, 128(%0)"
"\n movq %%r15, 136(%0)"
"\n movq 96(%0), %%rax" /* read input column 12 */
"\n movq 104(%0), %%rcx" /* read input column 13 */
"\n movq 112(%0), %%rbx" /* read input column 14 */
"\n movq 120(%0), %%rdx" /* read input column 15 */
"\n movq $0x3fffffffffffffff, %%r14"
"\n movq $0x2fffffffffffffff, %%r15"
"\n xorq %%r14, %%rax"
"\n xorq %%r15, %%rcx"
"\n movq $0x1fffffffffffffff, %%r14"
"\n movq $0x0fffffffffffffff, %%r15"
"\n xorq %%r14, %%rbx"
"\n xorq %%r15, %%rdx"
"\n movq 128(%0), %%r14"
"\n movq 136(%0), %%r15"
"\n # processing input words x[13]=rcx and x[15]=rdx "
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n xorq grsoT0(,%%rdi,8), %%r12"
"\n xorq grsoT1(,%%rsi,8), %%r10"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n xorq grsoT0(,%%rsi,8), %%r14"
"\n xorq grsoT1(,%%rdi,8), %%r12"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n xorq grsoT2(,%%rdi,8), %%r8"
"\n pxor grsoT3(,%%rsi,8), %%mm2"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n xorq grsoT2(,%%rsi,8), %%r10"
"\n pxor grsoT3(,%%rdi,8), %%mm4"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n xorq grsoT4(,%%rdi,8), %%r13"
"\n xorq grsoT5(,%%rsi,8), %%r11"
"\n shrq $16, %%rcx"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n xorq grsoT4(,%%rsi,8), %%r15"
"\n xorq grsoT5(,%%rdi,8), %%r13"
"\n shrq $16, %%rdx"
"\n movzbl %%cl, %%edi"
"\n movzbl %%ch, %%esi"
"\n xorq 8(%0), %%rsi"
"\n xorq grsoT6(,%%rdi,8), %%r9"
"\n pxor grsoT7(,%%rsi,8), %%mm7"
"\n movzbl %%dl, %%esi"
"\n movzbl %%dh, %%edi"
"\n xorq 8(%0), %%rdi"
"\n xorq grsoT6(,%%rsi,8), %%r11"
"\n xorq grsoT7(,%%rdi,8), %%r9"
"\n # processing input words x[12]=rax and x[14]=rbx "
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n xorq grsoT0(,%%rdi,8), %%r11"
"\n xorq grsoT1(,%%rsi,8), %%r9"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n xorq grsoT0(,%%rsi,8), %%r13"
"\n xorq grsoT1(,%%rdi,8), %%r11"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n pxor grsoT2(,%%rdi,8), %%mm7"
"\n pxor grsoT3(,%%rsi,8), %%mm1"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n xorq grsoT2(,%%rsi,8), %%r9"
"\n pxor grsoT3(,%%rdi,8), %%mm3"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n xorq grsoT4(,%%rdi,8), %%r12"
"\n xorq grsoT5(,%%rsi,8), %%r10"
"\n shrq $16, %%rax"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n xorq grsoT4(,%%rsi,8), %%r14"
"\n xorq grsoT5(,%%rdi,8), %%r12"
"\n shrq $16, %%rbx"
"\n movzbl %%al, %%edi"
"\n movzbl %%ah, %%esi"
"\n xorq 8(%0), %%rsi"
"\n xorq grsoT6(,%%rdi,8), %%r8"
"\n pxor grsoT7(,%%rsi,8), %%mm6"
"\n movzbl %%bl, %%esi"
"\n movzbl %%bh, %%edi"
"\n xorq 8(%0), %%rdi"
"\n xorq grsoT6(,%%rsi,8), %%r10"
"\n xorq grsoT7(,%%rdi,8), %%r8"
"\n incq 8(%0) #increment counter"
"\n movq 8(%0), %%rdi"
"\n cmp $14, %%edi"
"\n je 2f"
"\n movq %%mm1, %%rcx"
"\n movq %%mm3, %%rdx"
"\n movq %%mm0, 0(%0)"
"\n movq %%mm2, 16(%0)"
"\n movq %%mm4, 32(%0)"
"\n movq %%mm5, 40(%0)"
"\n movq %%mm6, 48(%0)"
"\n movq %%mm7, 56(%0)"
"\n movq %%r8 , 64(%0)"
"\n movq %%r9 , 72(%0)"
"\n movq %%r10, 80(%0)"
"\n movq %%r11, 88(%0)"
"\n movq %%r12, 96(%0)"
"\n movq %%r13, 104(%0)"
"\n movq %%r14, 112(%0)"
"\n movq %%r15, 120(%0)"
"\n jmp 1b"
"\n 2:"
"\n movq %%mm0, 0(%0)"
"\n movq %%mm1, 8(%0)"
"\n movq %%mm2, 16(%0)"
"\n movq %%mm3, 24(%0)"
"\n movq %%mm4, 32(%0)"
"\n movq %%mm5, 40(%0)"
"\n movq %%mm6, 48(%0)"
"\n movq %%mm7, 56(%0)"
"\n movq %%r8 , 64(%0)"
"\n movq %%r9 , 72(%0)"
"\n movq %%r10, 80(%0)"
"\n movq %%r11, 88(%0)"
"\n movq %%r12, 96(%0)"
"\n movq %%r13, 104(%0)"
"\n movq %%r14, 112(%0)"
"\n movq %%r15, 120(%0)"
: /*no output, only memory is modified */
: "r"(x)
: "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%rsi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory" , "%mm0", "%mm1", "%mm2" , "%mm3" , "%mm4" , "%mm5" , "%mm6" , "%mm7" );
}//Q512ASM()