/* salsa20-amd64.S  -  AMD64 implementation of Salsa20
 *
 * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * Based on public domain implementation by D. J. Bernstein at
 *  http://cr.yp.to/snuffle.html
 */

#ifdef __x86_64
#include <config.h>
#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SALSA20)

#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
# define ELF(...) __VA_ARGS__
#else
# define ELF(...) /*_*/
#endif

.text

.align 8
.globl _gcry_salsa20_amd64_keysetup
ELF(.type  _gcry_salsa20_amd64_keysetup,@function;)
_gcry_salsa20_amd64_keysetup:
	/*
	 * Store the key words and the four constant words into the
	 * 64-byte Salsa20 input block.
	 *
	 * input:
	 *	%rdi: input block (sixteen 32-bit words); byte offsets
	 *	      0-20, 28, 36, 40, 48, 56 and 60 are written
	 *	%rsi: key (32 bytes are read when %rdx >= 256, else 16)
	 *	%rdx: key length in bits
	 *
	 * The words at byte offsets 24, 32, 44 and 52 are not touched
	 * here (they are written by _gcry_salsa20_amd64_ivsetup).
	 *
	 * Clobbers %rax, %rcx, %rdx, %rsi, %r8, %r9, %r10 and flags.
	 *
	 * NOTE(review): the key length is compared as a full 64-bit
	 * value; confirm the C prototype passes it in a way that defines
	 * the upper half of %rdx.
	 */

	/* First 16 key bytes; stored the same way for both key sizes. */
	movl   0(%rsi),%r8d
	movl   4(%rsi),%r9d
	movl   8(%rsi),%eax
	movl   12(%rsi),%r10d
	movl   %r8d,20(%rdi)
	movl   %r9d,40(%rdi)
	movl   %eax,60(%rdi)
	movl   %r10d,48(%rdi)
	cmp  $256,%rdx
	jb .L_kbits128

	/* 256-bit key: key bytes 16..31 fill the other four key slots. */
	movl   16(%rsi),%edx
	movl   20(%rsi),%ecx
	movl   24(%rsi),%r8d
	movl   28(%rsi),%esi
	movl   %edx,28(%rdi)
	movl   %ecx,16(%rdi)
	movl   %r8d,36(%rdi)
	movl   %esi,56(%rdi)
	/* "expand 32-byte k" as four little-endian words */
	movl   $0x61707865,0(%rdi)
	movl   $0x3320646e,4(%rdi)
	movl   $0x79622d32,8(%rdi)
	movl   $0x6b206574,12(%rdi)
	ret

.L_kbits128:
	/* 128-bit key: key bytes 0..15 are used a second time. */
	movl   0(%rsi),%edx
	movl   4(%rsi),%ecx
	movl   8(%rsi),%r8d
	movl   12(%rsi),%esi
	movl   %edx,28(%rdi)
	movl   %ecx,16(%rdi)
	movl   %r8d,36(%rdi)
	movl   %esi,56(%rdi)
	/* "expand 16-byte k" as four little-endian words */
	movl   $0x61707865,0(%rdi)
	movl   $0x3120646e,4(%rdi)
	movl   $0x79622d36,8(%rdi)
	movl   $0x6b206574,12(%rdi)
	ret
ELF(.size _gcry_salsa20_amd64_keysetup,.-_gcry_salsa20_amd64_keysetup;)

.align 8
.globl _gcry_salsa20_amd64_ivsetup
ELF(.type  _gcry_salsa20_amd64_ivsetup,@function;)
_gcry_salsa20_amd64_ivsetup:
	/*
	 * Store the 8-byte IV into the Salsa20 input block and reset the
	 * two block-counter words to zero.
	 *
	 * input:
	 *	%rdi: input block; byte offsets 24 and 44 receive the IV,
	 *	      byte offsets 32 and 52 (the counter words advanced by
	 *	      _gcry_salsa20_amd64_encrypt_blocks) are zeroed
	 *	%rsi: IV (8 bytes)
	 *
	 * %rax is zero on return.  Clobbers %rsi, %r8 and flags.
	 */
	movl   0(%rsi),%r8d
	movl   4(%rsi),%esi
	xorl   %eax,%eax
	movl   %r8d,24(%rdi)
	movl   %esi,44(%rdi)
	movl   %eax,32(%rdi)
	movl   %eax,52(%rdi)
	ret
ELF(.size _gcry_salsa20_amd64_ivsetup,.-_gcry_salsa20_amd64_ivsetup;)

.align 8
.globl _gcry_salsa20_amd64_encrypt_blocks
ELF(.type  _gcry_salsa20_amd64_encrypt_blocks,@function;)
_gcry_salsa20_amd64_encrypt_blocks:
	/*
	 * XOR whole 64-byte blocks of source data with Salsa20 keystream.
	 *
	 * Modifications to original implementation:
	 *  - Number of rounds is passed in register %r8 (for Salsa20/12).
	 *  - Length is input as number of blocks, so don't handle tail bytes
	 *    (this is done in salsa20.c).
	 *
	 * input:
	 *	%rdi: 64-byte input block; it is read with movdqa, so it
	 *	      must be 16-byte aligned.  Its counter words (byte
	 *	      offsets 32 = low, 52 = high) are advanced by one for
	 *	      every block processed.
	 *	%rsi: source data
	 *	%rdx: destination
	 *	%rcx: number of 64-byte blocks
	 *	%r8:  number of rounds
	 * output:
	 *	%rax: number of bytes of stack reserved below the saved %rbx
	 *	      (384 plus alignment padding); presumably used by the
	 *	      caller to wipe the stack -- confirm in salsa20.c
	 *
	 * Stack frame (%rsp is aligned down to 32 bytes):
	 *	0..223	  the fourteen non-counter input words, each one
	 *		  broadcast to all four lanes of its own 16-byte slot
	 *	224, 240  low / high counter words of four consecutive blocks
	 *	256, 272  spill slots used inside .L_mainloop1
	 *	288	  remaining byte count, parked while %rdx is used as
	 *		  the round counter
	 */
	push %rbx
	shlq $6, %rcx /* blocks to bytes */
	mov %r8, %rbx
	/* %r11 = 384 + (%rsp & 31); subtracting it leaves %rsp 32-byte
	 * aligned.  %r11 is kept until .L_done to undo the adjustment. */
	mov %rsp,%r11
	and $31,%r11
	add $384,%r11
	sub %r11,%rsp
	/* From here on: %r8 = input block, %rsi = source, %rdi = destination,
	 * %rdx = bytes remaining, %rbx = number of rounds. */
	mov  %rdi,%r8
	mov  %rsi,%rsi
	mov  %rdx,%rdi
	mov  %rcx,%rdx
	cmp  $0,%rdx
	jbe .L_done
.L_start:
	cmp  $256,%rdx
	jb .L_bytes_are_64_128_or_192
	/*
	 * At least four blocks: broadcast every input word except the two
	 * counter words into its own vector on the stack (pshufd with
	 * $0x00/$0x55/$0xaa/$0xff replicates lane 0/1/2/3).
	 */
	movdqa 0(%r8),%xmm0
	pshufd $0x55,%xmm0,%xmm1
	pshufd $0xaa,%xmm0,%xmm2
	pshufd $0xff,%xmm0,%xmm3
	pshufd $0x00,%xmm0,%xmm0
	movdqa %xmm1,0(%rsp)
	movdqa %xmm2,16(%rsp)
	movdqa %xmm3,32(%rsp)
	movdqa %xmm0,48(%rsp)
	movdqa 16(%r8),%xmm0
	pshufd $0xaa,%xmm0,%xmm1
	pshufd $0xff,%xmm0,%xmm2
	pshufd $0x00,%xmm0,%xmm3
	pshufd $0x55,%xmm0,%xmm0
	movdqa %xmm1,64(%rsp)
	movdqa %xmm2,80(%rsp)
	movdqa %xmm3,96(%rsp)
	movdqa %xmm0,112(%rsp)
	/* Lane 0 of this row (byte offset 32) is the low counter word; it is
	 * handled separately below. */
	movdqa 32(%r8),%xmm0
	pshufd $0xff,%xmm0,%xmm1
	pshufd $0x55,%xmm0,%xmm2
	pshufd $0xaa,%xmm0,%xmm0
	movdqa %xmm1,128(%rsp)
	movdqa %xmm2,144(%rsp)
	movdqa %xmm0,160(%rsp)
	/* Lane 1 of this row (byte offset 52) is the high counter word; it is
	 * handled separately below. */
	movdqa 48(%r8),%xmm0
	pshufd $0x00,%xmm0,%xmm1
	pshufd $0xaa,%xmm0,%xmm2
	pshufd $0xff,%xmm0,%xmm0
	movdqa %xmm1,176(%rsp)
	movdqa %xmm2,192(%rsp)
	movdqa %xmm0,208(%rsp)
.L_bytesatleast256:
	/*
	 * Build the per-lane 64-bit block counters n, n+1, n+2, n+3
	 * (low words at 224(%rsp), high words at 240(%rsp)) and write
	 * n+4 back to the input block.
	 */
	movl   32(%r8),%ecx
	movl   52(%r8),%r9d
	movl %ecx,224(%rsp)
	movl %r9d,240(%rsp)
	add  $1,%ecx
	adc  $0,%r9d
	movl %ecx,4+224(%rsp)
	movl %r9d,4+240(%rsp)
	add  $1,%ecx
	adc  $0,%r9d
	movl %ecx,8+224(%rsp)
	movl %r9d,8+240(%rsp)
	add  $1,%ecx
	adc  $0,%r9d
	movl %ecx,12+224(%rsp)
	movl %r9d,12+240(%rsp)
	add  $1,%ecx
	adc  $0,%r9d
	movl   %ecx,32(%r8)
	movl   %r9d,52(%r8)
	/* Park the byte count; %rdx becomes the round counter. */
	movq %rdx,288(%rsp)
	mov  %rbx,%rdx
	/* Load the sixteen working vectors (one state word of four blocks
	 * per register). */
	movdqa 0(%rsp),%xmm0
	movdqa 16(%rsp),%xmm1
	movdqa 32(%rsp),%xmm2
	movdqa 192(%rsp),%xmm3
	movdqa 208(%rsp),%xmm4
	movdqa 64(%rsp),%xmm5
	movdqa 80(%rsp),%xmm6
	movdqa 112(%rsp),%xmm7
	movdqa 128(%rsp),%xmm8
	movdqa 144(%rsp),%xmm9
	movdqa 160(%rsp),%xmm10
	movdqa 240(%rsp),%xmm11
	movdqa 48(%rsp),%xmm12
	movdqa 96(%rsp),%xmm13
	movdqa 176(%rsp),%xmm14
	movdqa 224(%rsp),%xmm15
.L_mainloop1:
	/*
	 * Two rounds per iteration on four blocks in parallel (one block
	 * per lane).  Every seven-instruction group below computes
	 *	d ^= rotl32(a + b, k)
	 * for k in 7, 9, 13, 18: movdqa/paddd form a + b in a temporary,
	 * a second copy is taken, and pslld k / psrld (32 - k) followed by
	 * two pxor apply the rotated value to d.  All sixteen xmm registers
	 * hold state, so two of them are spilled to 256(%rsp)/272(%rsp) to
	 * make room for the temporaries; which state vectors live in the
	 * spill slots changes as the loop body proceeds.
	 */
	movdqa %xmm1,256(%rsp)
	movdqa %xmm2,272(%rsp)
	movdqa %xmm13,%xmm1
	paddd %xmm12,%xmm1
	movdqa %xmm1,%xmm2
	pslld $7,%xmm1
	pxor  %xmm1,%xmm14
	psrld $25,%xmm2
	pxor  %xmm2,%xmm14
	movdqa %xmm7,%xmm1
	paddd %xmm0,%xmm1
	movdqa %xmm1,%xmm2
	pslld $7,%xmm1
	pxor  %xmm1,%xmm11
	psrld $25,%xmm2
	pxor  %xmm2,%xmm11
	movdqa %xmm12,%xmm1
	paddd %xmm14,%xmm1
	movdqa %xmm1,%xmm2
	pslld $9,%xmm1
	pxor  %xmm1,%xmm15
	psrld $23,%xmm2
	pxor  %xmm2,%xmm15
	movdqa %xmm0,%xmm1
	paddd %xmm11,%xmm1
	movdqa %xmm1,%xmm2
	pslld $9,%xmm1
	pxor  %xmm1,%xmm9
	psrld $23,%xmm2
	pxor  %xmm2,%xmm9
	movdqa %xmm14,%xmm1
	paddd %xmm15,%xmm1
	movdqa %xmm1,%xmm2
	pslld $13,%xmm1
	pxor  %xmm1,%xmm13
	psrld $19,%xmm2
	pxor  %xmm2,%xmm13
	movdqa %xmm11,%xmm1
	paddd %xmm9,%xmm1
	movdqa %xmm1,%xmm2
	pslld $13,%xmm1
	pxor  %xmm1,%xmm7
	psrld $19,%xmm2
	pxor  %xmm2,%xmm7
	movdqa %xmm15,%xmm1
	paddd %xmm13,%xmm1
	movdqa %xmm1,%xmm2
	pslld $18,%xmm1
	pxor  %xmm1,%xmm12
	psrld $14,%xmm2
	pxor  %xmm2,%xmm12
	/* Swap: reload the vector spilled at 256(%rsp) into xmm1 and spill
	 * xmm12 there instead; xmm12 becomes a temporary. */
	movdqa 256(%rsp),%xmm1
	movdqa %xmm12,256(%rsp)
	movdqa %xmm9,%xmm2
	paddd %xmm7,%xmm2
	movdqa %xmm2,%xmm12
	pslld $18,%xmm2
	pxor  %xmm2,%xmm0
	psrld $14,%xmm12
	pxor  %xmm12,%xmm0
	movdqa %xmm5,%xmm2
	paddd %xmm1,%xmm2
	movdqa %xmm2,%xmm12
	pslld $7,%xmm2
	pxor  %xmm2,%xmm3
	psrld $25,%xmm12
	pxor  %xmm12,%xmm3
	/* Swap: reload the vector spilled at 272(%rsp) into xmm2 and spill
	 * xmm0 there instead; xmm0 becomes a temporary. */
	movdqa 272(%rsp),%xmm2
	movdqa %xmm0,272(%rsp)
	movdqa %xmm6,%xmm0
	paddd %xmm2,%xmm0
	movdqa %xmm0,%xmm12
	pslld $7,%xmm0
	pxor  %xmm0,%xmm4
	psrld $25,%xmm12
	pxor  %xmm12,%xmm4
	movdqa %xmm1,%xmm0
	paddd %xmm3,%xmm0
	movdqa %xmm0,%xmm12
	pslld $9,%xmm0
	pxor  %xmm0,%xmm10
	psrld $23,%xmm12
	pxor  %xmm12,%xmm10
	movdqa %xmm2,%xmm0
	paddd %xmm4,%xmm0
	movdqa %xmm0,%xmm12
	pslld $9,%xmm0
	pxor  %xmm0,%xmm8
	psrld $23,%xmm12
	pxor  %xmm12,%xmm8
	movdqa %xmm3,%xmm0
	paddd %xmm10,%xmm0
	movdqa %xmm0,%xmm12
	pslld $13,%xmm0
	pxor  %xmm0,%xmm5
	psrld $19,%xmm12
	pxor  %xmm12,%xmm5
	movdqa %xmm4,%xmm0
	paddd %xmm8,%xmm0
	movdqa %xmm0,%xmm12
	pslld $13,%xmm0
	pxor  %xmm0,%xmm6
	psrld $19,%xmm12
	pxor  %xmm12,%xmm6
	movdqa %xmm10,%xmm0
	paddd %xmm5,%xmm0
	movdqa %xmm0,%xmm12
	pslld $18,%xmm0
	pxor  %xmm0,%xmm1
	psrld $14,%xmm12
	pxor  %xmm12,%xmm1
	/* Swap spill slot 256(%rsp): bring its vector back into xmm0 and
	 * spill xmm1. */
	movdqa 256(%rsp),%xmm0
	movdqa %xmm1,256(%rsp)
	movdqa %xmm4,%xmm1
	paddd %xmm0,%xmm1
	movdqa %xmm1,%xmm12
	pslld $7,%xmm1
	pxor  %xmm1,%xmm7
	psrld $25,%xmm12
	pxor  %xmm12,%xmm7
	movdqa %xmm8,%xmm1
	paddd %xmm6,%xmm1
	movdqa %xmm1,%xmm12
	pslld $18,%xmm1
	pxor  %xmm1,%xmm2
	psrld $14,%xmm12
	pxor  %xmm12,%xmm2
	/* Swap spill slot 272(%rsp): bring its vector back into xmm12 and
	 * spill xmm2. */
	movdqa 272(%rsp),%xmm12
	movdqa %xmm2,272(%rsp)
	movdqa %xmm14,%xmm1
	paddd %xmm12,%xmm1
	movdqa %xmm1,%xmm2
	pslld $7,%xmm1
	pxor  %xmm1,%xmm5
	psrld $25,%xmm2
	pxor  %xmm2,%xmm5
	movdqa %xmm0,%xmm1
	paddd %xmm7,%xmm1
	movdqa %xmm1,%xmm2
	pslld $9,%xmm1
	pxor  %xmm1,%xmm10
	psrld $23,%xmm2
	pxor  %xmm2,%xmm10
	movdqa %xmm12,%xmm1
	paddd %xmm5,%xmm1
	movdqa %xmm1,%xmm2
	pslld $9,%xmm1
	pxor  %xmm1,%xmm8
	psrld $23,%xmm2
	pxor  %xmm2,%xmm8
	movdqa %xmm7,%xmm1
	paddd %xmm10,%xmm1
	movdqa %xmm1,%xmm2
	pslld $13,%xmm1
	pxor  %xmm1,%xmm4
	psrld $19,%xmm2
	pxor  %xmm2,%xmm4
	movdqa %xmm5,%xmm1
	paddd %xmm8,%xmm1
	movdqa %xmm1,%xmm2
	pslld $13,%xmm1
	pxor  %xmm1,%xmm14
	psrld $19,%xmm2
	pxor  %xmm2,%xmm14
	movdqa %xmm10,%xmm1
	paddd %xmm4,%xmm1
	movdqa %xmm1,%xmm2
	pslld $18,%xmm1
	pxor  %xmm1,%xmm0
	psrld $14,%xmm2
	pxor  %xmm2,%xmm0
	/* Swap spill slot 256(%rsp): bring its vector back into xmm1 and
	 * spill xmm0. */
	movdqa 256(%rsp),%xmm1
	movdqa %xmm0,256(%rsp)
	movdqa %xmm8,%xmm0
	paddd %xmm14,%xmm0
	movdqa %xmm0,%xmm2
	pslld $18,%xmm0
	pxor  %xmm0,%xmm12
	psrld $14,%xmm2
	pxor  %xmm2,%xmm12
	movdqa %xmm11,%xmm0
	paddd %xmm1,%xmm0
	movdqa %xmm0,%xmm2
	pslld $7,%xmm0
	pxor  %xmm0,%xmm6
	psrld $25,%xmm2
	pxor  %xmm2,%xmm6
	/* Swap spill slot 272(%rsp): bring its vector back into xmm2 and
	 * spill xmm12. */
	movdqa 272(%rsp),%xmm2
	movdqa %xmm12,272(%rsp)
	movdqa %xmm3,%xmm0
	paddd %xmm2,%xmm0
	movdqa %xmm0,%xmm12
	pslld $7,%xmm0
	pxor  %xmm0,%xmm13
	psrld $25,%xmm12
	pxor  %xmm12,%xmm13
	movdqa %xmm1,%xmm0
	paddd %xmm6,%xmm0
	movdqa %xmm0,%xmm12
	pslld $9,%xmm0
	pxor  %xmm0,%xmm15
	psrld $23,%xmm12
	pxor  %xmm12,%xmm15
	movdqa %xmm2,%xmm0
	paddd %xmm13,%xmm0
	movdqa %xmm0,%xmm12
	pslld $9,%xmm0
	pxor  %xmm0,%xmm9
	psrld $23,%xmm12
	pxor  %xmm12,%xmm9
	movdqa %xmm6,%xmm0
	paddd %xmm15,%xmm0
	movdqa %xmm0,%xmm12
	pslld $13,%xmm0
	pxor  %xmm0,%xmm11
	psrld $19,%xmm12
	pxor  %xmm12,%xmm11
	movdqa %xmm13,%xmm0
	paddd %xmm9,%xmm0
	movdqa %xmm0,%xmm12
	pslld $13,%xmm0
	pxor  %xmm0,%xmm3
	psrld $19,%xmm12
	pxor  %xmm12,%xmm3
	movdqa %xmm15,%xmm0
	paddd %xmm11,%xmm0
	movdqa %xmm0,%xmm12
	pslld $18,%xmm0
	pxor  %xmm0,%xmm1
	psrld $14,%xmm12
	pxor  %xmm12,%xmm1
	movdqa %xmm9,%xmm0
	paddd %xmm3,%xmm0
	movdqa %xmm0,%xmm12
	pslld $18,%xmm0
	pxor  %xmm0,%xmm2
	psrld $14,%xmm12
	pxor  %xmm12,%xmm2
	/* Bring the two spilled vectors back so that all sixteen registers
	 * hold state again, matching the register assignment at loop entry. */
	movdqa 256(%rsp),%xmm12
	movdqa 272(%rsp),%xmm0
	sub  $2,%rdx
	ja .L_mainloop1
	/*
	 * Rounds done.  For each group of four state vectors: add the
	 * saved input words, then for lane 0..3 move the low dword of
	 * each vector to a general register (only the low 32 bits are
	 * used), XOR with the source and store to the destination.  Lane i
	 * belongs to block i, hence the +64/+128/+192 offsets;
	 * pshufd $0x39 rotates the next lane into position 0.
	 *
	 * Group 1: output bytes 0..15 of each of the four blocks.
	 */
	paddd 48(%rsp),%xmm12
	paddd 112(%rsp),%xmm7
	paddd 160(%rsp),%xmm10
	paddd 208(%rsp),%xmm4
	movd   %xmm12,%rdx
	movd   %xmm7,%rcx
	movd   %xmm10,%r9
	movd   %xmm4,%rax
	pshufd $0x39,%xmm12,%xmm12
	pshufd $0x39,%xmm7,%xmm7
	pshufd $0x39,%xmm10,%xmm10
	pshufd $0x39,%xmm4,%xmm4
	xorl 0(%rsi),%edx
	xorl 4(%rsi),%ecx
	xorl 8(%rsi),%r9d
	xorl 12(%rsi),%eax
	movl   %edx,0(%rdi)
	movl   %ecx,4(%rdi)
	movl   %r9d,8(%rdi)
	movl   %eax,12(%rdi)
	movd   %xmm12,%rdx
	movd   %xmm7,%rcx
	movd   %xmm10,%r9
	movd   %xmm4,%rax
	pshufd $0x39,%xmm12,%xmm12
	pshufd $0x39,%xmm7,%xmm7
	pshufd $0x39,%xmm10,%xmm10
	pshufd $0x39,%xmm4,%xmm4
	xorl 64(%rsi),%edx
	xorl 68(%rsi),%ecx
	xorl 72(%rsi),%r9d
	xorl 76(%rsi),%eax
	movl   %edx,64(%rdi)
	movl   %ecx,68(%rdi)
	movl   %r9d,72(%rdi)
	movl   %eax,76(%rdi)
	movd   %xmm12,%rdx
	movd   %xmm7,%rcx
	movd   %xmm10,%r9
	movd   %xmm4,%rax
	pshufd $0x39,%xmm12,%xmm12
	pshufd $0x39,%xmm7,%xmm7
	pshufd $0x39,%xmm10,%xmm10
	pshufd $0x39,%xmm4,%xmm4
	xorl 128(%rsi),%edx
	xorl 132(%rsi),%ecx
	xorl 136(%rsi),%r9d
	xorl 140(%rsi),%eax
	movl   %edx,128(%rdi)
	movl   %ecx,132(%rdi)
	movl   %r9d,136(%rdi)
	movl   %eax,140(%rdi)
	movd   %xmm12,%rdx
	movd   %xmm7,%rcx
	movd   %xmm10,%r9
	movd   %xmm4,%rax
	xorl 192(%rsi),%edx
	xorl 196(%rsi),%ecx
	xorl 200(%rsi),%r9d
	xorl 204(%rsi),%eax
	movl   %edx,192(%rdi)
	movl   %ecx,196(%rdi)
	movl   %r9d,200(%rdi)
	movl   %eax,204(%rdi)
	/* Group 2: output bytes 16..31 of each of the four blocks. */
	paddd 176(%rsp),%xmm14
	paddd 0(%rsp),%xmm0
	paddd 64(%rsp),%xmm5
	paddd 128(%rsp),%xmm8
	movd   %xmm14,%rdx
	movd   %xmm0,%rcx
	movd   %xmm5,%r9
	movd   %xmm8,%rax
	pshufd $0x39,%xmm14,%xmm14
	pshufd $0x39,%xmm0,%xmm0
	pshufd $0x39,%xmm5,%xmm5
	pshufd $0x39,%xmm8,%xmm8
	xorl 16(%rsi),%edx
	xorl 20(%rsi),%ecx
	xorl 24(%rsi),%r9d
	xorl 28(%rsi),%eax
	movl   %edx,16(%rdi)
	movl   %ecx,20(%rdi)
	movl   %r9d,24(%rdi)
	movl   %eax,28(%rdi)
	movd   %xmm14,%rdx
	movd   %xmm0,%rcx
	movd   %xmm5,%r9
	movd   %xmm8,%rax
	pshufd $0x39,%xmm14,%xmm14
	pshufd $0x39,%xmm0,%xmm0
	pshufd $0x39,%xmm5,%xmm5
	pshufd $0x39,%xmm8,%xmm8
	xorl 80(%rsi),%edx
	xorl 84(%rsi),%ecx
	xorl 88(%rsi),%r9d
	xorl 92(%rsi),%eax
	movl   %edx,80(%rdi)
	movl   %ecx,84(%rdi)
	movl   %r9d,88(%rdi)
	movl   %eax,92(%rdi)
	movd   %xmm14,%rdx
	movd   %xmm0,%rcx
	movd   %xmm5,%r9
	movd   %xmm8,%rax
	pshufd $0x39,%xmm14,%xmm14
	pshufd $0x39,%xmm0,%xmm0
	pshufd $0x39,%xmm5,%xmm5
	pshufd $0x39,%xmm8,%xmm8
	xorl 144(%rsi),%edx
	xorl 148(%rsi),%ecx
	xorl 152(%rsi),%r9d
	xorl 156(%rsi),%eax
	movl   %edx,144(%rdi)
	movl   %ecx,148(%rdi)
	movl   %r9d,152(%rdi)
	movl   %eax,156(%rdi)
	movd   %xmm14,%rdx
	movd   %xmm0,%rcx
	movd   %xmm5,%r9
	movd   %xmm8,%rax
	xorl 208(%rsi),%edx
	xorl 212(%rsi),%ecx
	xorl 216(%rsi),%r9d
	xorl 220(%rsi),%eax
	movl   %edx,208(%rdi)
	movl   %ecx,212(%rdi)
	movl   %r9d,216(%rdi)
	movl   %eax,220(%rdi)
	/* Group 3: output bytes 32..47 of each of the four blocks.  The
	 * first two vectors are the counter words, so the per-lane counters
	 * at 224/240(%rsp) are what gets added back. */
	paddd 224(%rsp),%xmm15
	paddd 240(%rsp),%xmm11
	paddd 16(%rsp),%xmm1
	paddd 80(%rsp),%xmm6
	movd   %xmm15,%rdx
	movd   %xmm11,%rcx
	movd   %xmm1,%r9
	movd   %xmm6,%rax
	pshufd $0x39,%xmm15,%xmm15
	pshufd $0x39,%xmm11,%xmm11
	pshufd $0x39,%xmm1,%xmm1
	pshufd $0x39,%xmm6,%xmm6
	xorl 32(%rsi),%edx
	xorl 36(%rsi),%ecx
	xorl 40(%rsi),%r9d
	xorl 44(%rsi),%eax
	movl   %edx,32(%rdi)
	movl   %ecx,36(%rdi)
	movl   %r9d,40(%rdi)
	movl   %eax,44(%rdi)
	movd   %xmm15,%rdx
	movd   %xmm11,%rcx
	movd   %xmm1,%r9
	movd   %xmm6,%rax
	pshufd $0x39,%xmm15,%xmm15
	pshufd $0x39,%xmm11,%xmm11
	pshufd $0x39,%xmm1,%xmm1
	pshufd $0x39,%xmm6,%xmm6
	xorl 96(%rsi),%edx
	xorl 100(%rsi),%ecx
	xorl 104(%rsi),%r9d
	xorl 108(%rsi),%eax
	movl   %edx,96(%rdi)
	movl   %ecx,100(%rdi)
	movl   %r9d,104(%rdi)
	movl   %eax,108(%rdi)
	movd   %xmm15,%rdx
	movd   %xmm11,%rcx
	movd   %xmm1,%r9
	movd   %xmm6,%rax
	pshufd $0x39,%xmm15,%xmm15
	pshufd $0x39,%xmm11,%xmm11
	pshufd $0x39,%xmm1,%xmm1
	pshufd $0x39,%xmm6,%xmm6
	xorl 160(%rsi),%edx
	xorl 164(%rsi),%ecx
	xorl 168(%rsi),%r9d
	xorl 172(%rsi),%eax
	movl   %edx,160(%rdi)
	movl   %ecx,164(%rdi)
	movl   %r9d,168(%rdi)
	movl   %eax,172(%rdi)
	movd   %xmm15,%rdx
	movd   %xmm11,%rcx
	movd   %xmm1,%r9
	movd   %xmm6,%rax
	xorl 224(%rsi),%edx
	xorl 228(%rsi),%ecx
	xorl 232(%rsi),%r9d
	xorl 236(%rsi),%eax
	movl   %edx,224(%rdi)
	movl   %ecx,228(%rdi)
	movl   %r9d,232(%rdi)
	movl   %eax,236(%rdi)
	/* Group 4: output bytes 48..63 of each of the four blocks. */
	paddd 96(%rsp),%xmm13
	paddd 144(%rsp),%xmm9
	paddd 192(%rsp),%xmm3
	paddd 32(%rsp),%xmm2
	movd   %xmm13,%rdx
	movd   %xmm9,%rcx
	movd   %xmm3,%r9
	movd   %xmm2,%rax
	pshufd $0x39,%xmm13,%xmm13
	pshufd $0x39,%xmm9,%xmm9
	pshufd $0x39,%xmm3,%xmm3
	pshufd $0x39,%xmm2,%xmm2
	xorl 48(%rsi),%edx
	xorl 52(%rsi),%ecx
	xorl 56(%rsi),%r9d
	xorl 60(%rsi),%eax
	movl   %edx,48(%rdi)
	movl   %ecx,52(%rdi)
	movl   %r9d,56(%rdi)
	movl   %eax,60(%rdi)
	movd   %xmm13,%rdx
	movd   %xmm9,%rcx
	movd   %xmm3,%r9
	movd   %xmm2,%rax
	pshufd $0x39,%xmm13,%xmm13
	pshufd $0x39,%xmm9,%xmm9
	pshufd $0x39,%xmm3,%xmm3
	pshufd $0x39,%xmm2,%xmm2
	xorl 112(%rsi),%edx
	xorl 116(%rsi),%ecx
	xorl 120(%rsi),%r9d
	xorl 124(%rsi),%eax
	movl   %edx,112(%rdi)
	movl   %ecx,116(%rdi)
	movl   %r9d,120(%rdi)
	movl   %eax,124(%rdi)
	movd   %xmm13,%rdx
	movd   %xmm9,%rcx
	movd   %xmm3,%r9
	movd   %xmm2,%rax
	pshufd $0x39,%xmm13,%xmm13
	pshufd $0x39,%xmm9,%xmm9
	pshufd $0x39,%xmm3,%xmm3
	pshufd $0x39,%xmm2,%xmm2
	xorl 176(%rsi),%edx
	xorl 180(%rsi),%ecx
	xorl 184(%rsi),%r9d
	xorl 188(%rsi),%eax
	movl   %edx,176(%rdi)
	movl   %ecx,180(%rdi)
	movl   %r9d,184(%rdi)
	movl   %eax,188(%rdi)
	movd   %xmm13,%rdx
	movd   %xmm9,%rcx
	movd   %xmm3,%r9
	movd   %xmm2,%rax
	xorl 240(%rsi),%edx
	xorl 244(%rsi),%ecx
	xorl 248(%rsi),%r9d
	xorl 252(%rsi),%eax
	movl   %edx,240(%rdi)
	movl   %ecx,244(%rdi)
	movl   %r9d,248(%rdi)
	movl   %eax,252(%rdi)
	/* Four blocks written: restore the byte count, advance both data
	 * pointers and go again while at least 256 bytes remain.  The
	 * broadcast vectors at 0..223(%rsp) are still valid for the next
	 * pass; only the counters are rebuilt. */
	movq 288(%rsp),%rdx
	sub  $256,%rdx
	add  $256,%rsi
	add  $256,%rdi
	cmp  $256,%rdx
	jae .L_bytesatleast256
	cmp  $0,%rdx
	jbe .L_done
.L_bytes_are_64_128_or_192:
	/*
	 * One block at a time: the four 16-byte rows of the input block
	 * are kept in xmm0..xmm3.  %rdx again becomes the round counter
	 * while the byte count waits at 288(%rsp).
	 */
	movq %rdx,288(%rsp)
	movdqa 0(%r8),%xmm0
	movdqa 16(%r8),%xmm1
	movdqa 32(%r8),%xmm2
	movdqa 48(%r8),%xmm3
	movdqa %xmm1,%xmm4
	mov  %rbx,%rdx
.L_mainloop2:
	/*
	 * Four rounds per iteration.  The same add / rotate-left /
	 * xor steps as in .L_mainloop1 (rotations 7, 9, 13, 18 built from
	 * pslld + psrld) are applied to whole rows, with xmm4..xmm6 as
	 * temporaries; pshufd $0x93 / $0x4e / $0x39 rotate the lanes of a
	 * row between steps.
	 */
	paddd %xmm0,%xmm4
	movdqa %xmm0,%xmm5
	movdqa %xmm4,%xmm6
	pslld $7,%xmm4
	psrld $25,%xmm6
	pxor  %xmm4,%xmm3
	pxor  %xmm6,%xmm3
	paddd %xmm3,%xmm5
	movdqa %xmm3,%xmm4
	movdqa %xmm5,%xmm6
	pslld $9,%xmm5
	psrld $23,%xmm6
	pxor  %xmm5,%xmm2
	pshufd $0x93,%xmm3,%xmm3
	pxor  %xmm6,%xmm2
	paddd %xmm2,%xmm4
	movdqa %xmm2,%xmm5
	movdqa %xmm4,%xmm6
	pslld $13,%xmm4
	psrld $19,%xmm6
	pxor  %xmm4,%xmm1
	pshufd $0x4e,%xmm2,%xmm2
	pxor  %xmm6,%xmm1
	paddd %xmm1,%xmm5
	movdqa %xmm3,%xmm4
	movdqa %xmm5,%xmm6
	pslld $18,%xmm5
	psrld $14,%xmm6
	pxor  %xmm5,%xmm0
	pshufd $0x39,%xmm1,%xmm1
	pxor  %xmm6,%xmm0
	paddd %xmm0,%xmm4
	movdqa %xmm0,%xmm5
	movdqa %xmm4,%xmm6
	pslld $7,%xmm4
	psrld $25,%xmm6
	pxor  %xmm4,%xmm1
	pxor  %xmm6,%xmm1
	paddd %xmm1,%xmm5
	movdqa %xmm1,%xmm4
	movdqa %xmm5,%xmm6
	pslld $9,%xmm5
	psrld $23,%xmm6
	pxor  %xmm5,%xmm2
	pshufd $0x93,%xmm1,%xmm1
	pxor  %xmm6,%xmm2
	paddd %xmm2,%xmm4
	movdqa %xmm2,%xmm5
	movdqa %xmm4,%xmm6
	pslld $13,%xmm4
	psrld $19,%xmm6
	pxor  %xmm4,%xmm3
	pshufd $0x4e,%xmm2,%xmm2
	pxor  %xmm6,%xmm3
	paddd %xmm3,%xmm5
	movdqa %xmm1,%xmm4
	movdqa %xmm5,%xmm6
	pslld $18,%xmm5
	psrld $14,%xmm6
	pxor  %xmm5,%xmm0
	pshufd $0x39,%xmm3,%xmm3
	pxor  %xmm6,%xmm0
	paddd %xmm0,%xmm4
	movdqa %xmm0,%xmm5
	movdqa %xmm4,%xmm6
	pslld $7,%xmm4
	psrld $25,%xmm6
	pxor  %xmm4,%xmm3
	pxor  %xmm6,%xmm3
	paddd %xmm3,%xmm5
	movdqa %xmm3,%xmm4
	movdqa %xmm5,%xmm6
	pslld $9,%xmm5
	psrld $23,%xmm6
	pxor  %xmm5,%xmm2
	pshufd $0x93,%xmm3,%xmm3
	pxor  %xmm6,%xmm2
	paddd %xmm2,%xmm4
	movdqa %xmm2,%xmm5
	movdqa %xmm4,%xmm6
	pslld $13,%xmm4
	psrld $19,%xmm6
	pxor  %xmm4,%xmm1
	pshufd $0x4e,%xmm2,%xmm2
	pxor  %xmm6,%xmm1
	paddd %xmm1,%xmm5
	movdqa %xmm3,%xmm4
	movdqa %xmm5,%xmm6
	pslld $18,%xmm5
	psrld $14,%xmm6
	pxor  %xmm5,%xmm0
	pshufd $0x39,%xmm1,%xmm1
	pxor  %xmm6,%xmm0
	paddd %xmm0,%xmm4
	movdqa %xmm0,%xmm5
	movdqa %xmm4,%xmm6
	pslld $7,%xmm4
	psrld $25,%xmm6
	pxor  %xmm4,%xmm1
	pxor  %xmm6,%xmm1
	paddd %xmm1,%xmm5
	movdqa %xmm1,%xmm4
	movdqa %xmm5,%xmm6
	pslld $9,%xmm5
	psrld $23,%xmm6
	pxor  %xmm5,%xmm2
	pshufd $0x93,%xmm1,%xmm1
	pxor  %xmm6,%xmm2
	paddd %xmm2,%xmm4
	movdqa %xmm2,%xmm5
	movdqa %xmm4,%xmm6
	pslld $13,%xmm4
	psrld $19,%xmm6
	pxor  %xmm4,%xmm3
	pshufd $0x4e,%xmm2,%xmm2
	pxor  %xmm6,%xmm3
	/* The flags set by this sub are consumed by the ja at the end of
	 * the loop; none of the SSE instructions in between modify them. */
	sub  $4,%rdx
	paddd %xmm3,%xmm5
	movdqa %xmm1,%xmm4
	movdqa %xmm5,%xmm6
	pslld $18,%xmm5
	/* xmm7 is zeroed here but is not read again on this path. */
	pxor   %xmm7,%xmm7
	psrld $14,%xmm6
	pxor  %xmm5,%xmm0
	pshufd $0x39,%xmm3,%xmm3
	pxor  %xmm6,%xmm0
	ja .L_mainloop2
	/*
	 * Add the original input rows, then write the block out: take
	 * lane 0 of each row (low 32 bits of the movd result), XOR it with
	 * the source word at the listed offset and store it at the same
	 * offset of the destination; pshufd $0x39 brings the next lane
	 * into position 0.
	 */
	paddd 0(%r8),%xmm0
	paddd 16(%r8),%xmm1
	paddd 32(%r8),%xmm2
	paddd 48(%r8),%xmm3
	movd   %xmm0,%rdx
	movd   %xmm1,%rcx
	movd   %xmm2,%rax
	movd   %xmm3,%r10
	pshufd $0x39,%xmm0,%xmm0
	pshufd $0x39,%xmm1,%xmm1
	pshufd $0x39,%xmm2,%xmm2
	pshufd $0x39,%xmm3,%xmm3
	xorl 0(%rsi),%edx
	xorl 48(%rsi),%ecx
	xorl 32(%rsi),%eax
	xorl 16(%rsi),%r10d
	movl   %edx,0(%rdi)
	movl   %ecx,48(%rdi)
	movl   %eax,32(%rdi)
	movl   %r10d,16(%rdi)
	movd   %xmm0,%rdx
	movd   %xmm1,%rcx
	movd   %xmm2,%rax
	movd   %xmm3,%r10
	pshufd $0x39,%xmm0,%xmm0
	pshufd $0x39,%xmm1,%xmm1
	pshufd $0x39,%xmm2,%xmm2
	pshufd $0x39,%xmm3,%xmm3
	xorl 20(%rsi),%edx
	xorl 4(%rsi),%ecx
	xorl 52(%rsi),%eax
	xorl 36(%rsi),%r10d
	movl   %edx,20(%rdi)
	movl   %ecx,4(%rdi)
	movl   %eax,52(%rdi)
	movl   %r10d,36(%rdi)
	movd   %xmm0,%rdx
	movd   %xmm1,%rcx
	movd   %xmm2,%rax
	movd   %xmm3,%r10
	pshufd $0x39,%xmm0,%xmm0
	pshufd $0x39,%xmm1,%xmm1
	pshufd $0x39,%xmm2,%xmm2
	pshufd $0x39,%xmm3,%xmm3
	xorl 40(%rsi),%edx
	xorl 24(%rsi),%ecx
	xorl 8(%rsi),%eax
	xorl 56(%rsi),%r10d
	movl   %edx,40(%rdi)
	movl   %ecx,24(%rdi)
	movl   %eax,8(%rdi)
	movl   %r10d,56(%rdi)
	movd   %xmm0,%rdx
	movd   %xmm1,%rcx
	movd   %xmm2,%rax
	movd   %xmm3,%r10
	xorl 60(%rsi),%edx
	xorl 44(%rsi),%ecx
	xorl 28(%rsi),%eax
	xorl 12(%rsi),%r10d
	movl   %edx,60(%rdi)
	movl   %ecx,44(%rdi)
	movl   %eax,28(%rdi)
	movl   %r10d,12(%rdi)
	/* Restore the byte count and advance the 64-bit block counter in
	 * the input block by one (low word at offset 32, carry into the
	 * high word at offset 52). */
	movq 288(%rsp),%rdx
	movl   32(%r8),%ecx
	movl   52(%r8),%eax
	add  $1,%ecx
	adc  $0,%eax
	movl   %ecx,32(%r8)
	movl   %eax,52(%r8)
	cmp  $64,%rdx
	ja .L_bytes_are_128_or_192
.L_done:
	/* Undo the stack adjustment and return its size in %rax. */
	add %r11,%rsp
	mov %r11,%rax
	pop %rbx
	ret
.L_bytes_are_128_or_192:
	/* More than one block was left: step past the block just written
	 * and process the next one. */
	sub  $64,%rdx
	add  $64,%rdi
	add  $64,%rsi
	jmp .L_bytes_are_64_128_or_192
ELF(.size _gcry_salsa20_amd64_encrypt_blocks,.-_gcry_salsa20_amd64_encrypt_blocks;)

#endif /*defined(USE_SALSA20)*/
#endif /*__x86_64*/
