/*
 * Copyright (c) 2014 John Doering
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifdef _MSC_VER
/* arch defines */
#include "miner.h"
#endif

#if defined(__GNUC__) && !defined(__arm__)
#define ASM 1
/* #define WIN64 0 */
#endif

#if (ASM) && (__x86_64__)

/* neoscrypt_blkcpy(dst, src, len) = SSE2 based block memcpy();
 * len must be a multiple of 64 bytes aligned properly */
.globl neoscrypt_blkcpy_sse2
neoscrypt_blkcpy_sse2:
#if (WIN64)
    movq %rdi, %r10
    movq %rsi, %r11
    movq %rcx, %rdi
    movq %rdx, %rsi
    movq %r8, %rdx
#endif
    xorq %rcx, %rcx
    movl %edx, %ecx
    shrl $6, %ecx
    movq $64, %rax
.blkcpy:
    movdqa 0(%rsi), %xmm0
    movdqa 16(%rsi), %xmm1
    movdqa 32(%rsi), %xmm2
    movdqa 48(%rsi), %xmm3
    movdqa %xmm0, 0(%rdi)
    movdqa %xmm1, 16(%rdi)
    movdqa %xmm2, 32(%rdi)
    movdqa %xmm3, 48(%rdi)
    addq %rax, %rdi
    addq %rax, %rsi
    decl %ecx
    jnz .blkcpy
#if (WIN64)
    movq %r10, %rdi
    movq %r11, %rsi
#endif
    ret

/* neoscrypt_blkswp(blkA, blkB, len) = SSE2 based block swapper;
 * len must be a multiple of 64 bytes aligned properly */
.globl neoscrypt_blkswp_sse2
neoscrypt_blkswp_sse2:
#if (WIN64)
    movq %rdi, %r10
    movq %rsi, %r11
    movq %rcx, %rdi
    movq %rdx, %rsi
    movq %r8, %rdx
#endif
    xorq %rcx, %rcx
    movl %edx, %ecx
    shrl $6, %ecx
    movq $64, %rax
.blkswp:
    movdqa 0(%rdi), %xmm0
    movdqa 16(%rdi), %xmm1
    movdqa 32(%rdi), %xmm2
    movdqa 48(%rdi), %xmm3
    movdqa 0(%rsi), %xmm4
    movdqa 16(%rsi), %xmm5
    movdqa 32(%rsi), %xmm8
    movdqa 48(%rsi), %xmm9
    movdqa %xmm0, 0(%rsi)
    movdqa %xmm1, 16(%rsi)
    movdqa %xmm2, 32(%rsi)
    movdqa %xmm3, 48(%rsi)
    movdqa %xmm4, 0(%rdi)
    movdqa %xmm5, 16(%rdi)
    movdqa %xmm8, 32(%rdi)
    movdqa %xmm9, 48(%rdi)
    addq %rax, %rdi
    addq %rax, %rsi
    decl %ecx
    jnz .blkswp
#if (WIN64)
    movq %r10, %rdi
    movq %r11, %rsi
#endif
    ret

/* neoscrypt_blkxor(dst, src, len) = SSE2 based block XOR engine;
 * len must be a multiple of 64 bytes aligned properly */
.globl neoscrypt_blkxor_sse2
neoscrypt_blkxor_sse2:
#if (WIN64)
    movq %rdi, %r10
    movq %rsi, %r11
    movq %rcx, %rdi
    movq %rdx, %rsi
    movq %r8, %rdx
#endif
    xorq %rcx, %rcx
    movl %edx, %ecx
    shrl $6, %ecx
    movq $64, %rax
.blkxor:
    movdqa 0(%rdi), %xmm0
    movdqa 16(%rdi), %xmm1
    movdqa 32(%rdi), %xmm2
    movdqa 48(%rdi), %xmm3
    movdqa 0(%rsi), %xmm4
    movdqa 16(%rsi), %xmm5
    movdqa 32(%rsi), %xmm8
    movdqa 48(%rsi), %xmm9
    pxor %xmm4, %xmm0
    pxor %xmm5, %xmm1
    pxor %xmm8, %xmm2
    pxor %xmm9, %xmm3
    movdqa %xmm0, 0(%rdi)
    movdqa %xmm1, 16(%rdi)
    movdqa %xmm2, 32(%rdi)
    movdqa %xmm3, 48(%rdi)
    addq %rax, %rdi
    addq %rax, %rsi
    decl %ecx
    jnz .blkxor
#if (WIN64)
    movq %r10, %rdi
    movq %r11, %rsi
#endif
    ret
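
/* A possible C-side view of the three block helpers above; the real
 * declarations live in the C sources, so treat these prototypes as a
 * sketch (the uint type and the void pointers are assumptions):
 *
 *   void neoscrypt_blkcpy_sse2(void *dst, const void *src, uint len);
 *   void neoscrypt_blkswp_sse2(void *blkA, void *blkB, uint len);
 *   void neoscrypt_blkxor_sse2(void *dst, const void *src, uint len);
 *
 * All three move data in 64-byte chunks with movdqa, so the buffers must
 * be 16-byte aligned and len must be a multiple of 64 bytes. */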

/* neoscrypt_salsa(mem, rounds) = SSE2 based Salsa20;
 * mem must be aligned properly, rounds must be a multiple of 2 */
.globl neoscrypt_salsa
neoscrypt_salsa:
#if (WIN64)
    movq %rdi, %r10
    movq %rsi, %r11
    movq %rcx, %rdi
    movq %rdx, %rsi
#endif
    xorq %rcx, %rcx
    movl %esi, %ecx
    shrl $1, %ecx
    movdqa 0(%rdi), %xmm0
    movdqa %xmm0, %xmm12
    movdqa 16(%rdi), %xmm1
    movdqa %xmm1, %xmm13
    movdqa 32(%rdi), %xmm2
    movdqa %xmm2, %xmm14
    movdqa 48(%rdi), %xmm3
    movdqa %xmm3, %xmm15
.salsa:
    movdqa %xmm1, %xmm4
    paddd %xmm0, %xmm4
    movdqa %xmm4, %xmm5
    pslld $7, %xmm4
    psrld $25, %xmm5
    pxor %xmm4, %xmm3
    movdqa %xmm0, %xmm4
    pxor %xmm5, %xmm3
    paddd %xmm3, %xmm4
    movdqa %xmm4, %xmm5
    pslld $9, %xmm4
    psrld $23, %xmm5
    pxor %xmm4, %xmm2
    movdqa %xmm3, %xmm4
    pxor %xmm5, %xmm2
    pshufd $0x93, %xmm3, %xmm3
    paddd %xmm2, %xmm4
    movdqa %xmm4, %xmm5
    pslld $13, %xmm4
    psrld $19, %xmm5
    pxor %xmm4, %xmm1
    movdqa %xmm2, %xmm4
    pxor %xmm5, %xmm1
    pshufd $0x4E, %xmm2, %xmm2
    paddd %xmm1, %xmm4
    movdqa %xmm4, %xmm5
    pslld $18, %xmm4
    psrld $14, %xmm5
    pxor %xmm4, %xmm0
    movdqa %xmm3, %xmm4
    pxor %xmm5, %xmm0
    pshufd $0x39, %xmm1, %xmm1
    paddd %xmm0, %xmm4
    movdqa %xmm4, %xmm5
    pslld $7, %xmm4
    psrld $25, %xmm5
    pxor %xmm4, %xmm1
    movdqa %xmm0, %xmm4
    pxor %xmm5, %xmm1
    paddd %xmm1, %xmm4
    movdqa %xmm4, %xmm5
    pslld $9, %xmm4
    psrld $23, %xmm5
    pxor %xmm4, %xmm2
    movdqa %xmm1, %xmm4
    pxor %xmm5, %xmm2
    pshufd $0x93, %xmm1, %xmm1
    paddd %xmm2, %xmm4
    movdqa %xmm4, %xmm5
    pslld $13, %xmm4
    psrld $19, %xmm5
    pxor %xmm4, %xmm3
    movdqa %xmm2, %xmm4
    pxor %xmm5, %xmm3
    pshufd $0x4E, %xmm2, %xmm2
    paddd %xmm3, %xmm4
    movdqa %xmm4, %xmm5
    pslld $18, %xmm4
    psrld $14, %xmm5
    pxor %xmm4, %xmm0
    pshufd $0x39, %xmm3, %xmm3
    pxor %xmm5, %xmm0
    decl %ecx
    jnz .salsa
    paddd %xmm12, %xmm0
    movdqa %xmm0, 0(%rdi)
    paddd %xmm13, %xmm1
    movdqa %xmm1, 16(%rdi)
    paddd %xmm14, %xmm2
    movdqa %xmm2, 32(%rdi)
    paddd %xmm15, %xmm3
    movdqa %xmm3, 48(%rdi)
#if (WIN64)
    movq %r10, %rdi
    movq %r11, %rsi
#endif
    ret

/* neoscrypt_salsa_tangle(mem, count) = Salsa20 SSE2 map switcher;
 * correct map: 0 1  2  3  4 5 6  7  8  9 10 11 12 13 14 15
 * SSE2 map:    0 5 10 15 12 1 6 11  8 13  2  7  4  9 14  3 */
.globl neoscrypt_salsa_tangle
neoscrypt_salsa_tangle:
#if (WIN64)
    movq %rdi, %r10
    movq %rsi, %r11
    movq %rcx, %rdi
    movq %rdx, %rsi
#endif
    xorq %rcx, %rcx
    movl %esi, %ecx
    movq $64, %r8
.salsa_tangle:
    movl 4(%rdi), %eax
    movl 20(%rdi), %edx
    movl %eax, 20(%rdi)
    movl %edx, 4(%rdi)
    movl 8(%rdi), %eax
    movl 40(%rdi), %edx
    movl %eax, 40(%rdi)
    movl %edx, 8(%rdi)
    movl 12(%rdi), %eax
    movl 60(%rdi), %edx
    movl %eax, 60(%rdi)
    movl %edx, 12(%rdi)
    movl 16(%rdi), %eax
    movl 48(%rdi), %edx
    movl %eax, 48(%rdi)
    movl %edx, 16(%rdi)
    movl 28(%rdi), %eax
    movl 44(%rdi), %edx
    movl %eax, 44(%rdi)
    movl %edx, 28(%rdi)
    movl 36(%rdi), %eax
    movl 52(%rdi), %edx
    movl %eax, 52(%rdi)
    movl %edx, 36(%rdi)
    addq %r8, %rdi
    decl %ecx
    jnz .salsa_tangle
#if (WIN64)
    movq %r10, %rdi
    movq %r11, %rsi
#endif
    ret
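
/* For reference, the tangle above is equivalent to swapping six word
 * pairs per 64-byte block, as in this portable sketch (hypothetical
 * helper name and assumed C types; the assembly is authoritative):
 *
 *   static void salsa_tangle_sketch(uint *X, uint count) {
 *       static const uint a[6] = { 1, 2,  3,  4,  7,  9 };
 *       static const uint b[6] = { 5, 10, 15, 12, 11, 13 };
 *       uint i, t;
 *       for(; count; count--, X += 16)
 *           for(i = 0; i < 6; i++) {
 *               t = X[a[i]]; X[a[i]] = X[b[i]]; X[b[i]] = t;
 *           }
 *   }
 *
 * The permutation is an involution, so the same swaps convert between
 * the linear and SSE2 word orders in either direction. */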

/* neoscrypt_chacha(mem, rounds) = SSE2 based ChaCha20;
 * mem must be aligned properly, rounds must be a multiple of 2 */
.globl neoscrypt_chacha
neoscrypt_chacha:
#if (WIN64)
    movq %rdi, %r10
    movq %rsi, %r11
    movq %rcx, %rdi
    movq %rdx, %rsi
#endif
    xorq %rcx, %rcx
    movl %esi, %ecx
    shrl $1, %ecx
    movdqa 0(%rdi), %xmm0
    movdqa %xmm0, %xmm12
    movdqa 16(%rdi), %xmm1
    movdqa %xmm1, %xmm13
    movdqa 32(%rdi), %xmm2
    movdqa %xmm2, %xmm14
    movdqa 48(%rdi), %xmm3
    movdqa %xmm3, %xmm15
.chacha:
    paddd %xmm1, %xmm0
    pxor %xmm0, %xmm3
    pshuflw $0xB1, %xmm3, %xmm3
    pshufhw $0xB1, %xmm3, %xmm3
    paddd %xmm3, %xmm2
    pxor %xmm2, %xmm1
    movdqa %xmm1, %xmm4
    pslld $12, %xmm1
    psrld $20, %xmm4
    pxor %xmm4, %xmm1
    paddd %xmm1, %xmm0
    pxor %xmm0, %xmm3
    movdqa %xmm3, %xmm4
    pslld $8, %xmm3
    psrld $24, %xmm4
    pxor %xmm4, %xmm3
    pshufd $0x93, %xmm0, %xmm0
    paddd %xmm3, %xmm2
    pshufd $0x4E, %xmm3, %xmm3
    pxor %xmm2, %xmm1
    pshufd $0x39, %xmm2, %xmm2
    movdqa %xmm1, %xmm4
    pslld $7, %xmm1
    psrld $25, %xmm4
    pxor %xmm4, %xmm1
    paddd %xmm1, %xmm0
    pxor %xmm0, %xmm3
    pshuflw $0xB1, %xmm3, %xmm3
    pshufhw $0xB1, %xmm3, %xmm3
    paddd %xmm3, %xmm2
    pxor %xmm2, %xmm1
    movdqa %xmm1, %xmm4
    pslld $12, %xmm1
    psrld $20, %xmm4
    pxor %xmm4, %xmm1
    paddd %xmm1, %xmm0
    pxor %xmm0, %xmm3
    movdqa %xmm3, %xmm4
    pslld $8, %xmm3
    psrld $24, %xmm4
    pxor %xmm4, %xmm3
    pshufd $0x39, %xmm0, %xmm0
    paddd %xmm3, %xmm2
    pshufd $0x4E, %xmm3, %xmm3
    pxor %xmm2, %xmm1
    pshufd $0x93, %xmm2, %xmm2
    movdqa %xmm1, %xmm4
    pslld $7, %xmm1
    psrld $25, %xmm4
    pxor %xmm4, %xmm1
    decl %ecx
    jnz .chacha
    paddd %xmm12, %xmm0
    movdqa %xmm0, 0(%rdi)
    paddd %xmm13, %xmm1
    movdqa %xmm1, 16(%rdi)
    paddd %xmm14, %xmm2
    movdqa %xmm2, 32(%rdi)
    paddd %xmm15, %xmm3
    movdqa %xmm3, 48(%rdi)
#if (WIN64)
    movq %r10, %rdi
    movq %r11, %rsi
#endif
    ret

#endif /* (ASM) && (__x86_64__) */
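
/* Likely C-side declarations for the exported core functions, shown only
 * as a usage sketch (parameter types are assumptions; check the C sources
 * for the real prototypes):
 *
 *   void neoscrypt_salsa(uint *mem, uint rounds);
 *   void neoscrypt_chacha(uint *mem, uint rounds);
 *   void neoscrypt_salsa_tangle(uint *mem, uint count);
 *
 * Both core functions transform a single 64-byte state in place, expect a
 * 16-byte aligned pointer for the movdqa accesses, and require an even
 * number of rounds because each loop iteration performs one double round.
 * The SSE2 Salsa20 operates on the shuffled word layout documented above
 * neoscrypt_salsa_tangle(), which converts to and from the linear order. */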