Based on Nekogram. Key additions: - Rebrand to FoxiGram (app name, APK name, applicationId com.foxigram.app) - Embedded Xray (VLESS+Reality) proxy client via JNI libxray.so - Bundled hidden one-tap proxies (LTE + WiFi), read-only in UI - Auto-restore proxy on restart, rebind to active network (LTE/WiFi) - Server credentials externalized to git-ignored XrayServers.java (+ template) - libxray Go source included; compiled .so, keystore, google-services.json ignored
1254 lines
25 KiB
ArmAsm
1254 lines
25 KiB
ArmAsm
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)

// Constant pool read by the routines in this file. All accesses are
// RIP-relative loads; nothing here is written at run time.
.section .rodata
.align 64

// vpshufb control. In memory the bytes are 0x0f, 0x0e, ..., 0x00, so a
// shuffle with this mask reverses the 16 bytes of each 128-bit lane.
.Lbswap_mask:
	.quad 0x08090a0b0c0d0e0f, 0x0001020304050607

// Multiplier used by the reduction ("fold") steps: the code multiplies the
// low qword of a partial product by the high qword of this constant.
// NOTE(review): presumably the GHASH reduction polynomial in the bit order
// this file works in - confirm against the generator script.
.Lgfpoly:
	.quad 1, 0xc200000000000000

// Same as .Lgfpoly with bit 0 of the high qword also set. Only
// gcm_init_vpclmulqdq_avx512 reads it, as the AND mask operand of its
// vpternlogd $0x78 step.
.Lgfpoly_and_internal_carrybit:
	.quad 1, 0xc200000000000001

// Four 128-bit values 0, 1, 2, 3. Added with vpaddd to a counter block that
// was broadcast to all four lanes, giving four consecutive counters.
.Lctr_pattern:
	.quad 0, 0
	.quad 1, 0
	.quad 2, 0
	.quad 3, 0

// 128-bit value 4, broadcast to every lane and added with vpaddd to step
// all four lane counters past one 64-byte vector.
.Linc_4blocks:
	.quad 4, 0

.text

// gcm_init_vpclmulqdq_avx512
//
// Builds a 256-byte table of sixteen 128-bit values from one 16-byte input.
//
// ABI:   System V AMD64 (the file is ELF-only).
// In:    rdi = destination table, 256 bytes are written
//        rsi = source, 16 bytes are read
// Out:   none in registers
// Clobb: eax, r8, zmm0-zmm5, flags
//
// NOTE(review): the C prototype is not visible here; presumably rdi is the
// GHASH key-power table and rsi the raw hash key - confirm with the caller.
//
// Table layout as produced below, writing V for the preprocessed input and
// V^n for n-fold products under the multiply-and-reduce sequence:
//   offset   0: V^16 V^15 V^14 V^13
//   offset  64: V^12 V^11 V^10 V^9
//   offset 128: V^8  V^7  V^6  V^5
//   offset 192: V^4  V^3  V^2  V^1
//
// Multiply-and-reduce pattern used throughout this file:
//   vpclmulqdq $0x00           low x low product ("lo")
//   vpclmulqdq $0x01 and $0x10 the two cross products, XORed ("mid")
//   vpclmulqdq $0x11           high x high product ("hi")
//   fold: multiply the low qword of lo by the high qword of .Lgfpoly, XOR
//   that and the qword-swapped lo into mid; then fold mid into hi the same
//   way. vpternlogd $0x96 is a three-input XOR.
.globl gcm_init_vpclmulqdq_avx512
.hidden gcm_init_vpclmulqdq_avx512
.type gcm_init_vpclmulqdq_avx512,@function
.align 32
gcm_init_vpclmulqdq_avx512:
.cfi_startproc

_CET_ENDBR

	// r8 = address of the last 64-byte row of the table; rows are filled
	// from the end toward the start.
	leaq 256-64(%rdi),%r8

	// Load the input with its two 64-bit halves swapped.
	vpshufd $0x4e,(%rsi),%xmm3

	// Preprocess the input: shift each qword left by one bit (vpaddq of the
	// value with itself), then XOR in (mask AND constant). The mask is built
	// from dword sign bits selected by the $0xd3 shuffle, and vpternlogd
	// $0x78 computes dst ^ (src2 & src3).
	// NOTE(review): presumably this is the multiply-by-x step, with the
	// carry between qwords and the conditional reduction done in one go.
	vpshufd $0xd3,%xmm3,%xmm0
	vpsrad $31,%xmm0,%xmm0
	vpaddq %xmm3,%xmm3,%xmm3

	vpternlogd $0x78,.Lgfpoly_and_internal_carrybit(%rip),%xmm0,%xmm3

	// zmm5 = .Lgfpoly in every 128-bit lane.
	vbroadcasti32x4 .Lgfpoly(%rip),%zmm5

	// xmm4 = V * V = V^2.
	vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0
	vpclmulqdq $0x01,%xmm3,%xmm3,%xmm1
	vpclmulqdq $0x10,%xmm3,%xmm3,%xmm2
	vpxord %xmm2,%xmm1,%xmm1
	vpclmulqdq $0x01,%xmm0,%xmm5,%xmm2
	vpshufd $0x4e,%xmm0,%xmm0
	vpternlogd $0x96,%xmm2,%xmm0,%xmm1
	vpclmulqdq $0x11,%xmm3,%xmm3,%xmm4
	vpclmulqdq $0x01,%xmm1,%xmm5,%xmm0
	vpshufd $0x4e,%xmm1,%xmm1
	vpternlogd $0x96,%xmm0,%xmm1,%xmm4

	// ymm3 = [V^2, V^1] (lane 0, lane 1); ymm4 = [V^2, V^2].
	vinserti128 $1,%xmm3,%ymm4,%ymm3
	vinserti128 $1,%xmm4,%ymm4,%ymm4

	// ymm4 = ymm3 * ymm4 = [V^4, V^3].
	vpclmulqdq $0x00,%ymm4,%ymm3,%ymm0
	vpclmulqdq $0x01,%ymm4,%ymm3,%ymm1
	vpclmulqdq $0x10,%ymm4,%ymm3,%ymm2
	vpxord %ymm2,%ymm1,%ymm1
	vpclmulqdq $0x01,%ymm0,%ymm5,%ymm2
	vpshufd $0x4e,%ymm0,%ymm0
	vpternlogd $0x96,%ymm2,%ymm0,%ymm1
	vpclmulqdq $0x11,%ymm4,%ymm3,%ymm4
	vpclmulqdq $0x01,%ymm1,%ymm5,%ymm0
	vpshufd $0x4e,%ymm1,%ymm1
	vpternlogd $0x96,%ymm0,%ymm1,%ymm4

	// zmm3 = [V^4, V^3, V^2, V^1]; zmm4 = V^4 in every lane.
	vinserti64x4 $1,%ymm3,%zmm4,%zmm3
	vshufi64x2 $0,%zmm4,%zmm4,%zmm4

	// Store the last row (table offset 192).
	vmovdqu8 %zmm3,(%r8)

	// Three more rows: each pass multiplies the previous row by V^4 and
	// stores it 64 bytes lower.
	movl $3,%eax
.Lprecompute_next:
	subq $64,%r8
	vpclmulqdq $0x00,%zmm4,%zmm3,%zmm0
	vpclmulqdq $0x01,%zmm4,%zmm3,%zmm1
	vpclmulqdq $0x10,%zmm4,%zmm3,%zmm2
	vpxord %zmm2,%zmm1,%zmm1
	vpclmulqdq $0x01,%zmm0,%zmm5,%zmm2
	vpshufd $0x4e,%zmm0,%zmm0
	vpternlogd $0x96,%zmm2,%zmm0,%zmm1
	vpclmulqdq $0x11,%zmm4,%zmm3,%zmm3
	vpclmulqdq $0x01,%zmm1,%zmm5,%zmm0
	vpshufd $0x4e,%zmm1,%zmm1
	vpternlogd $0x96,%zmm0,%zmm1,%zmm3

	vmovdqu8 %zmm3,(%r8)
	decl %eax
	jnz .Lprecompute_next

	// Upper vector state was dirtied by the ymm/zmm work above.
	vzeroupper
	ret

.cfi_endproc
.size gcm_init_vpclmulqdq_avx512, . - gcm_init_vpclmulqdq_avx512

// gcm_gmult_vpclmulqdq_avx512
//
// Multiplies one 16-byte value in place by the last entry of a 256-byte
// table, using the multiply-and-reduce sequence with .Lgfpoly.
//
// ABI:   System V AMD64.
// In:    rdi = 16-byte value, read and then overwritten with the result
//        rsi = table; only the 16 bytes at offset 240 are read
// Out:   none in registers
// Clobb: xmm0-xmm6
//
// NOTE(review): rsi is presumably the table written by
// gcm_init_vpclmulqdq_avx512, whose offset 240 holds the first power -
// confirm with the caller.
//
// Only xmm registers are written, so there is no vzeroupper before ret.
.globl gcm_gmult_vpclmulqdq_avx512
.hidden gcm_gmult_vpclmulqdq_avx512
.type gcm_gmult_vpclmulqdq_avx512,@function
.align 32
gcm_gmult_vpclmulqdq_avx512:
.cfi_startproc

_CET_ENDBR

	// xmm0 = value, xmm1 = byte-reversal mask, xmm2 = table entry,
	// xmm3 = .Lgfpoly. The value is byte-reversed before the multiply.
	vmovdqu (%rdi),%xmm0
	vmovdqu .Lbswap_mask(%rip),%xmm1
	vmovdqu 256-16(%rsi),%xmm2
	vmovdqu .Lgfpoly(%rip),%xmm3
	vpshufb %xmm1,%xmm0,%xmm0

	// xmm0 = xmm0 * xmm2, reduced: lo in xmm4, mid in xmm5, hi in xmm0.
	vpclmulqdq $0x00,%xmm2,%xmm0,%xmm4
	vpclmulqdq $0x01,%xmm2,%xmm0,%xmm5
	vpclmulqdq $0x10,%xmm2,%xmm0,%xmm6
	vpxord %xmm6,%xmm5,%xmm5
	vpclmulqdq $0x01,%xmm4,%xmm3,%xmm6
	vpshufd $0x4e,%xmm4,%xmm4
	vpternlogd $0x96,%xmm6,%xmm4,%xmm5
	vpclmulqdq $0x11,%xmm2,%xmm0,%xmm0
	vpclmulqdq $0x01,%xmm5,%xmm3,%xmm4
	vpshufd $0x4e,%xmm5,%xmm5
	vpternlogd $0x96,%xmm4,%xmm5,%xmm0

	// Reverse the bytes back and store the result over the input.
	vpshufb %xmm1,%xmm0,%xmm0
	vmovdqu %xmm0,(%rdi)

	ret

.cfi_endproc
.size gcm_gmult_vpclmulqdq_avx512, . - gcm_gmult_vpclmulqdq_avx512

// gcm_ghash_vpclmulqdq_avx512
//
// Folds a byte buffer into a 16-byte accumulator: each 16-byte block is
// byte-reversed, XORed into the accumulator and multiplied (with reduction)
// by an entry of the table.
//
// ABI:   System V AMD64.
// In:    rdi = 16-byte accumulator, read at entry and written at exit
//        rsi = 256-byte table (read only)
//        rdx = data pointer
//        rcx = data length in bytes
// Out:   none in registers
// Clobb: rcx, rdx, zmm0-zmm13, flags
//
// NOTE(review): the final loop subtracts 16 and exits only when the count
// reaches exactly zero, so rcx is assumed to be a multiple of 16 - confirm
// that every caller guarantees this.
// NOTE(review): rsi is presumably the table written by
// gcm_init_vpclmulqdq_avx512 - confirm with the caller.
//
// Three processing widths, chosen by the remaining length:
//   >= 256 bytes: four 64-byte vectors per pass, one shared reduction
//   >=  64 bytes: one 64-byte vector per pass
//   otherwise:    one 16-byte block per pass
.globl gcm_ghash_vpclmulqdq_avx512
.hidden gcm_ghash_vpclmulqdq_avx512
.type gcm_ghash_vpclmulqdq_avx512,@function
.align 32
gcm_ghash_vpclmulqdq_avx512:
.cfi_startproc

_CET_ENDBR

	// xmm4 = byte-reversal mask, xmm10 = .Lgfpoly.
	vmovdqu .Lbswap_mask(%rip),%xmm4
	vmovdqu .Lgfpoly(%rip),%xmm10

	// xmm5 = accumulator, byte-reversed. The upper lanes of zmm5 are zero.
	vmovdqu (%rdi),%xmm5
	vpshufb %xmm4,%xmm5,%xmm5

	// Short inputs go straight to the 16-byte loop.
	cmpq $64,%rcx
	jb .Laad_blockbyblock

	// Replicate both constants to all four 128-bit lanes.
	vshufi64x2 $0,%zmm4,%zmm4,%zmm4
	vshufi64x2 $0,%zmm10,%zmm10,%zmm10

	// zmm9 = last table row (offset 192), used by both wide loops.
	vmovdqu8 256-64(%rsi),%zmm9

	cmpq $256,%rcx
	jb .Laad_loop_1x

	// zmm6, zmm7, zmm8 = table rows at offsets 0, 64 and 128.
	vmovdqu8 256-256(%rsi),%zmm6
	vmovdqu8 256-192(%rsi),%zmm7
	vmovdqu8 256-128(%rsi),%zmm8

	// 256 bytes per pass. The accumulator is XORed into the first vector,
	// each data vector is multiplied by its own table row, the partial
	// products are summed, and one reduction covers all four.
.Laad_loop_4x:
	vmovdqu8 0(%rdx),%zmm0
	vmovdqu8 64(%rdx),%zmm1
	vmovdqu8 128(%rdx),%zmm2
	vmovdqu8 192(%rdx),%zmm3
	vpshufb %zmm4,%zmm0,%zmm0
	vpxord %zmm5,%zmm0,%zmm0
	vpshufb %zmm4,%zmm1,%zmm1
	vpshufb %zmm4,%zmm2,%zmm2
	vpshufb %zmm4,%zmm3,%zmm3
	vpclmulqdq $0x00,%zmm6,%zmm0,%zmm5
	vpclmulqdq $0x00,%zmm7,%zmm1,%zmm11
	vpclmulqdq $0x00,%zmm8,%zmm2,%zmm12
	vpxord %zmm11,%zmm5,%zmm5
	vpclmulqdq $0x00,%zmm9,%zmm3,%zmm13
	vpternlogd $0x96,%zmm13,%zmm12,%zmm5
	vpclmulqdq $0x01,%zmm6,%zmm0,%zmm11
	vpclmulqdq $0x01,%zmm7,%zmm1,%zmm12
	vpclmulqdq $0x01,%zmm8,%zmm2,%zmm13
	vpternlogd $0x96,%zmm13,%zmm12,%zmm11
	vpclmulqdq $0x01,%zmm9,%zmm3,%zmm12
	vpclmulqdq $0x10,%zmm6,%zmm0,%zmm13
	vpternlogd $0x96,%zmm13,%zmm12,%zmm11
	vpclmulqdq $0x10,%zmm7,%zmm1,%zmm12
	vpclmulqdq $0x10,%zmm8,%zmm2,%zmm13
	vpternlogd $0x96,%zmm13,%zmm12,%zmm11
	vpclmulqdq $0x01,%zmm5,%zmm10,%zmm13
	vpclmulqdq $0x10,%zmm9,%zmm3,%zmm12
	vpxord %zmm12,%zmm11,%zmm11
	vpshufd $0x4e,%zmm5,%zmm5
	vpclmulqdq $0x11,%zmm6,%zmm0,%zmm0
	vpclmulqdq $0x11,%zmm7,%zmm1,%zmm1
	vpclmulqdq $0x11,%zmm8,%zmm2,%zmm2
	vpternlogd $0x96,%zmm13,%zmm5,%zmm11
	vpclmulqdq $0x11,%zmm9,%zmm3,%zmm3
	vpternlogd $0x96,%zmm2,%zmm1,%zmm0
	vpclmulqdq $0x01,%zmm11,%zmm10,%zmm12
	vpxord %zmm3,%zmm0,%zmm5
	vpshufd $0x4e,%zmm11,%zmm11
	vpternlogd $0x96,%zmm12,%zmm11,%zmm5
	// XOR the four 128-bit lanes together; xmm5 is the new accumulator and
	// the upper lanes of zmm5 become zero again.
	vextracti32x4 $1,%zmm5,%xmm0
	vextracti32x4 $2,%zmm5,%xmm1
	vextracti32x4 $3,%zmm5,%xmm2
	vpxord %xmm0,%xmm5,%xmm5
	vpternlogd $0x96,%xmm1,%xmm2,%xmm5

	addq $256,%rdx
	subq $256,%rcx
	cmpq $256,%rcx
	jae .Laad_loop_4x

	cmpq $64,%rcx
	jb .Laad_large_done

	// 64 bytes per pass, multiplied by the last table row.
.Laad_loop_1x:
	vmovdqu8 (%rdx),%zmm0
	vpshufb %zmm4,%zmm0,%zmm0
	vpxord %zmm0,%zmm5,%zmm5
	vpclmulqdq $0x00,%zmm9,%zmm5,%zmm0
	vpclmulqdq $0x01,%zmm9,%zmm5,%zmm1
	vpclmulqdq $0x10,%zmm9,%zmm5,%zmm2
	vpxord %zmm2,%zmm1,%zmm1
	vpclmulqdq $0x01,%zmm0,%zmm10,%zmm2
	vpshufd $0x4e,%zmm0,%zmm0
	vpternlogd $0x96,%zmm2,%zmm0,%zmm1
	vpclmulqdq $0x11,%zmm9,%zmm5,%zmm5
	vpclmulqdq $0x01,%zmm1,%zmm10,%zmm0
	vpshufd $0x4e,%zmm1,%zmm1
	vpternlogd $0x96,%zmm0,%zmm1,%zmm5

	// Collapse the four lanes into xmm5 as above.
	vextracti32x4 $1,%zmm5,%xmm0
	vextracti32x4 $2,%zmm5,%xmm1
	vextracti32x4 $3,%zmm5,%xmm2
	vpxord %xmm0,%xmm5,%xmm5
	vpternlogd $0x96,%xmm1,%xmm2,%xmm5

	addq $64,%rdx
	subq $64,%rcx
	cmpq $64,%rcx
	jae .Laad_loop_1x

.Laad_large_done:

	// 16 bytes per pass, multiplied by the table entry at offset 240.
.Laad_blockbyblock:
	testq %rcx,%rcx
	jz .Laad_done
	vmovdqu 256-16(%rsi),%xmm9
.Laad_loop_blockbyblock:
	vmovdqu (%rdx),%xmm0
	vpshufb %xmm4,%xmm0,%xmm0
	vpxor %xmm0,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm9,%xmm5,%xmm0
	vpclmulqdq $0x01,%xmm9,%xmm5,%xmm1
	vpclmulqdq $0x10,%xmm9,%xmm5,%xmm2
	vpxord %xmm2,%xmm1,%xmm1
	vpclmulqdq $0x01,%xmm0,%xmm10,%xmm2
	vpshufd $0x4e,%xmm0,%xmm0
	vpternlogd $0x96,%xmm2,%xmm0,%xmm1
	vpclmulqdq $0x11,%xmm9,%xmm5,%xmm5
	vpclmulqdq $0x01,%xmm1,%xmm10,%xmm0
	vpshufd $0x4e,%xmm1,%xmm1
	vpternlogd $0x96,%xmm0,%xmm1,%xmm5

	addq $16,%rdx
	subq $16,%rcx
	jnz .Laad_loop_blockbyblock

.Laad_done:

	// Reverse the bytes back and store the accumulator.
	vpshufb %xmm4,%xmm5,%xmm5
	vmovdqu %xmm5,(%rdi)

	vzeroupper
	ret

.cfi_endproc
.size gcm_ghash_vpclmulqdq_avx512, . - gcm_ghash_vpclmulqdq_avx512

// aes_gcm_enc_update_vaes_avx512
//
// Counter-mode AES over a byte buffer, with the 16-byte hash accumulator
// updated over the bytes written to the destination.
//
// ABI:   System V AMD64.
// In:    rdi = source bytes
//        rsi = destination bytes
//        rdx = byte count; any value is handled, a trailing partial vector
//              goes through byte-masked loads and stores
//        rcx = AES key schedule: 16-byte round keys from offset 0 and a
//              32-bit field at offset 240
//        r8  = 16-byte counter block; read only, never written back
//        r9  = 256-byte table (read only)
//        first stack argument = pointer to the 16-byte accumulator, read at
//              entry and written at exit
// Out:   none in registers
// Clobb: rax, rdx, rsi, rdi, r8, r10, r11, k1, k2, zmm0-zmm31, flags
//        (r12 is saved and restored)
//
// NOTE(review): r9 is presumably the table written by
// gcm_init_vpclmulqdq_avx512 - confirm with the caller.
// NOTE(review): the arithmetic on the field at 240(%rcx) only lines up with
// the round-key offsets if that field is 9, 11 or 13 for AES-128, AES-192
// and AES-256 (r10d then becomes 16, 24 or 32) - confirm against the key
// setup code.
//
// Counters are stepped with vpaddd on the byte-reversed block, so each
// 128-bit lane gets a 32-bit add with no carry into the next dword.
//
// Long-lived registers:
//   zmm8  byte-reversal mask          zmm31 .Lgfpoly
//   zmm10 accumulator (low lane)      zmm12 four lane counters
//   zmm11 counter increment           zmm13 first round key
//   zmm14 last round key              zmm15-zmm23 nine round keys before it
//   zmm27-zmm30 table rows at offsets 0, 64, 128, 192
//   r11   address of the last round key
.globl aes_gcm_enc_update_vaes_avx512
.hidden aes_gcm_enc_update_vaes_avx512
.type aes_gcm_enc_update_vaes_avx512,@function
.align 32
aes_gcm_enc_update_vaes_avx512:
.cfi_startproc

_CET_ENDBR
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-16

	// First stack argument: 8(%rsp) at entry, 16(%rsp) after the push.
	movq 16(%rsp),%r12
#ifdef BORINGSSL_DISPATCH_TEST
.extern BORINGSSL_function_hit
.hidden BORINGSSL_function_hit
	movb $1,BORINGSSL_function_hit+7(%rip)
#endif

	vbroadcasti32x4 .Lbswap_mask(%rip),%zmm8
	vbroadcasti32x4 .Lgfpoly(%rip),%zmm31

	// Accumulator and counter block, both byte-reversed.
	vmovdqu (%r12),%xmm10
	vpshufb %xmm8,%xmm10,%xmm10
	vbroadcasti32x4 (%r8),%zmm12
	vpshufb %zmm8,%zmm12,%zmm12

	// r10d = 4 * field - 20; compared with 24 further down to pick the
	// number of extra rounds.
	movl 240(%rcx),%r10d
	leal -20(,%r10,4),%r10d

	// r11 = rcx + 96 + 4 * r10: address of the last round key.
	leaq 96(%rcx,%r10,4),%r11
	vbroadcasti32x4 (%rcx),%zmm13
	vbroadcasti32x4 (%r11),%zmm14

	// Lane counters become base+0, base+1, base+2, base+3.
	vpaddd .Lctr_pattern(%rip),%zmm12,%zmm12

	vbroadcasti32x4 .Linc_4blocks(%rip),%zmm11

	cmpq $256,%rdx
	jb .Lcrypt_loop_4x_done__func1

	vmovdqu8 256-256(%r9),%zmm27
	vmovdqu8 256-192(%r9),%zmm28
	vmovdqu8 256-128(%r9),%zmm29
	vmovdqu8 256-64(%r9),%zmm30

	// First 256 bytes are processed on their own, without hashing: the
	// main loop hashes the output of the previous pass while it encrypts
	// the next counters.
	vpshufb %zmm8,%zmm12,%zmm0
	vpaddd %zmm11,%zmm12,%zmm12
	vpshufb %zmm8,%zmm12,%zmm1
	vpaddd %zmm11,%zmm12,%zmm12
	vpshufb %zmm8,%zmm12,%zmm2
	vpaddd %zmm11,%zmm12,%zmm12
	vpshufb %zmm8,%zmm12,%zmm3
	vpaddd %zmm11,%zmm12,%zmm12

	vpxord %zmm13,%zmm0,%zmm0
	vpxord %zmm13,%zmm1,%zmm1
	vpxord %zmm13,%zmm2,%zmm2
	vpxord %zmm13,%zmm3,%zmm3

	// All middle rounds, walking the round keys from 16(%rcx) up to, but
	// not including, the last one at r11.
	leaq 16(%rcx),%rax
.Lvaesenc_loop_first_4_vecs__func1:
	vbroadcasti32x4 (%rax),%zmm9
	vaesenc %zmm9,%zmm0,%zmm0
	vaesenc %zmm9,%zmm1,%zmm1
	vaesenc %zmm9,%zmm2,%zmm2
	vaesenc %zmm9,%zmm3,%zmm3

	addq $16,%rax
	cmpq %rax,%r11
	jne .Lvaesenc_loop_first_4_vecs__func1
	// XOR the source into the last round key first, so that vaesenclast
	// yields keystream XOR source directly.
	vpxord 0(%rdi),%zmm14,%zmm4
	vpxord 64(%rdi),%zmm14,%zmm5
	vpxord 128(%rdi),%zmm14,%zmm6
	vpxord 192(%rdi),%zmm14,%zmm7
	vaesenclast %zmm4,%zmm0,%zmm4
	vaesenclast %zmm5,%zmm1,%zmm5
	vaesenclast %zmm6,%zmm2,%zmm6
	vaesenclast %zmm7,%zmm3,%zmm7
	vmovdqu8 %zmm4,0(%rsi)
	vmovdqu8 %zmm5,64(%rsi)
	vmovdqu8 %zmm6,128(%rsi)
	vmovdqu8 %zmm7,192(%rsi)

	addq $256,%rdi
	addq $256,%rsi
	subq $256,%rdx
	cmpq $256,%rdx
	jb .Lghash_last_ciphertext_4x__func1

	// Cache the nine round keys that precede the last one.
	vbroadcasti32x4 -144(%r11),%zmm15
	vbroadcasti32x4 -128(%r11),%zmm16
	vbroadcasti32x4 -112(%r11),%zmm17
	vbroadcasti32x4 -96(%r11),%zmm18
	vbroadcasti32x4 -80(%r11),%zmm19
	vbroadcasti32x4 -64(%r11),%zmm20
	vbroadcasti32x4 -48(%r11),%zmm21
	vbroadcasti32x4 -32(%r11),%zmm22
	vbroadcasti32x4 -16(%r11),%zmm23

	// Main loop, 256 bytes per pass. On entry zmm4-zmm7 hold the four
	// output vectors of the previous pass; they are hashed here while the
	// next four counter vectors (zmm0-zmm3) run through the AES rounds.
.Lcrypt_loop_4x__func1:

	vpshufb %zmm8,%zmm12,%zmm0
	vpaddd %zmm11,%zmm12,%zmm12
	vpshufb %zmm8,%zmm12,%zmm1
	vpaddd %zmm11,%zmm12,%zmm12
	vpshufb %zmm8,%zmm12,%zmm2
	vpaddd %zmm11,%zmm12,%zmm12
	vpshufb %zmm8,%zmm12,%zmm3
	vpaddd %zmm11,%zmm12,%zmm12

	vpxord %zmm13,%zmm0,%zmm0
	vpxord %zmm13,%zmm1,%zmm1
	vpxord %zmm13,%zmm2,%zmm2
	vpxord %zmm13,%zmm3,%zmm3

	// r10d below 24 skips all four extra rounds, equal to 24 skips two.
	cmpl $24,%r10d
	jl .Laes128__func1
	je .Laes192__func1

	vbroadcasti32x4 -208(%r11),%zmm9
	vaesenc %zmm9,%zmm0,%zmm0
	vaesenc %zmm9,%zmm1,%zmm1
	vaesenc %zmm9,%zmm2,%zmm2
	vaesenc %zmm9,%zmm3,%zmm3

	vbroadcasti32x4 -192(%r11),%zmm9
	vaesenc %zmm9,%zmm0,%zmm0
	vaesenc %zmm9,%zmm1,%zmm1
	vaesenc %zmm9,%zmm2,%zmm2
	vaesenc %zmm9,%zmm3,%zmm3

.Laes192__func1:
	vbroadcasti32x4 -176(%r11),%zmm9
	vaesenc %zmm9,%zmm0,%zmm0
	vaesenc %zmm9,%zmm1,%zmm1
	vaesenc %zmm9,%zmm2,%zmm2
	vaesenc %zmm9,%zmm3,%zmm3

	vbroadcasti32x4 -160(%r11),%zmm9
	vaesenc %zmm9,%zmm0,%zmm0
	vaesenc %zmm9,%zmm1,%zmm1
	vaesenc %zmm9,%zmm2,%zmm2
	vaesenc %zmm9,%zmm3,%zmm3

.Laes128__func1:

	// Prefetch the source 512 bytes ahead of the current position.
	prefetcht0 512+0(%rdi)
	prefetcht0 512+64(%rdi)
	prefetcht0 512+128(%rdi)
	prefetcht0 512+192(%rdi)

	// From here the hash of zmm4-zmm7 is interleaved with the nine cached
	// AES rounds. The order is deliberate; keep it when editing.
	vpshufb %zmm8,%zmm4,%zmm4
	vpxord %zmm10,%zmm4,%zmm4
	vpshufb %zmm8,%zmm5,%zmm5
	vpshufb %zmm8,%zmm6,%zmm6

	vaesenc %zmm15,%zmm0,%zmm0
	vaesenc %zmm15,%zmm1,%zmm1
	vaesenc %zmm15,%zmm2,%zmm2
	vaesenc %zmm15,%zmm3,%zmm3

	vpshufb %zmm8,%zmm7,%zmm7
	vpclmulqdq $0x00,%zmm27,%zmm4,%zmm10
	vpclmulqdq $0x00,%zmm28,%zmm5,%zmm24
	vpclmulqdq $0x00,%zmm29,%zmm6,%zmm25

	vaesenc %zmm16,%zmm0,%zmm0
	vaesenc %zmm16,%zmm1,%zmm1
	vaesenc %zmm16,%zmm2,%zmm2
	vaesenc %zmm16,%zmm3,%zmm3

	vpxord %zmm24,%zmm10,%zmm10
	vpclmulqdq $0x00,%zmm30,%zmm7,%zmm26
	vpternlogd $0x96,%zmm26,%zmm25,%zmm10
	vpclmulqdq $0x01,%zmm27,%zmm4,%zmm24

	vaesenc %zmm17,%zmm0,%zmm0
	vaesenc %zmm17,%zmm1,%zmm1
	vaesenc %zmm17,%zmm2,%zmm2
	vaesenc %zmm17,%zmm3,%zmm3

	vpclmulqdq $0x01,%zmm28,%zmm5,%zmm25
	vpclmulqdq $0x01,%zmm29,%zmm6,%zmm26
	vpternlogd $0x96,%zmm26,%zmm25,%zmm24
	vpclmulqdq $0x01,%zmm30,%zmm7,%zmm25

	vaesenc %zmm18,%zmm0,%zmm0
	vaesenc %zmm18,%zmm1,%zmm1
	vaesenc %zmm18,%zmm2,%zmm2
	vaesenc %zmm18,%zmm3,%zmm3

	vpclmulqdq $0x10,%zmm27,%zmm4,%zmm26
	vpternlogd $0x96,%zmm26,%zmm25,%zmm24
	vpclmulqdq $0x10,%zmm28,%zmm5,%zmm25
	vpclmulqdq $0x10,%zmm29,%zmm6,%zmm26

	vaesenc %zmm19,%zmm0,%zmm0
	vaesenc %zmm19,%zmm1,%zmm1
	vaesenc %zmm19,%zmm2,%zmm2
	vaesenc %zmm19,%zmm3,%zmm3

	vpternlogd $0x96,%zmm26,%zmm25,%zmm24
	vpclmulqdq $0x01,%zmm10,%zmm31,%zmm26
	vpclmulqdq $0x10,%zmm30,%zmm7,%zmm25
	vpxord %zmm25,%zmm24,%zmm24

	vaesenc %zmm20,%zmm0,%zmm0
	vaesenc %zmm20,%zmm1,%zmm1
	vaesenc %zmm20,%zmm2,%zmm2
	vaesenc %zmm20,%zmm3,%zmm3

	vpshufd $0x4e,%zmm10,%zmm10
	vpclmulqdq $0x11,%zmm27,%zmm4,%zmm4
	vpclmulqdq $0x11,%zmm28,%zmm5,%zmm5
	vpclmulqdq $0x11,%zmm29,%zmm6,%zmm6

	vaesenc %zmm21,%zmm0,%zmm0
	vaesenc %zmm21,%zmm1,%zmm1
	vaesenc %zmm21,%zmm2,%zmm2
	vaesenc %zmm21,%zmm3,%zmm3

	vpternlogd $0x96,%zmm26,%zmm10,%zmm24
	vpclmulqdq $0x11,%zmm30,%zmm7,%zmm7
	vpternlogd $0x96,%zmm6,%zmm5,%zmm4
	vpclmulqdq $0x01,%zmm24,%zmm31,%zmm25

	vaesenc %zmm22,%zmm0,%zmm0
	vaesenc %zmm22,%zmm1,%zmm1
	vaesenc %zmm22,%zmm2,%zmm2
	vaesenc %zmm22,%zmm3,%zmm3

	vpxord %zmm7,%zmm4,%zmm10
	vpshufd $0x4e,%zmm24,%zmm24
	vpternlogd $0x96,%zmm25,%zmm24,%zmm10

	vaesenc %zmm23,%zmm0,%zmm0
	vaesenc %zmm23,%zmm1,%zmm1
	vaesenc %zmm23,%zmm2,%zmm2
	vaesenc %zmm23,%zmm3,%zmm3

	// XOR the four 128-bit lanes of the hash into xmm10.
	vextracti32x4 $1,%zmm10,%xmm4
	vextracti32x4 $2,%zmm10,%xmm5
	vextracti32x4 $3,%zmm10,%xmm6
	vpxord %xmm4,%xmm10,%xmm10
	vpternlogd $0x96,%xmm5,%xmm6,%xmm10

	// Last round merged with the source XOR, then store 256 output bytes.
	vpxord 0(%rdi),%zmm14,%zmm4
	vpxord 64(%rdi),%zmm14,%zmm5
	vpxord 128(%rdi),%zmm14,%zmm6
	vpxord 192(%rdi),%zmm14,%zmm7
	vaesenclast %zmm4,%zmm0,%zmm4
	vaesenclast %zmm5,%zmm1,%zmm5
	vaesenclast %zmm6,%zmm2,%zmm6
	vaesenclast %zmm7,%zmm3,%zmm7
	vmovdqu8 %zmm4,0(%rsi)
	vmovdqu8 %zmm5,64(%rsi)
	vmovdqu8 %zmm6,128(%rsi)
	vmovdqu8 %zmm7,192(%rsi)

	addq $256,%rdi
	addq $256,%rsi
	subq $256,%rdx
	cmpq $256,%rdx
	jae .Lcrypt_loop_4x__func1

	// Hash the four output vectors of the final 256-byte pass, which the
	// loop above has not covered yet.
.Lghash_last_ciphertext_4x__func1:
	vpshufb %zmm8,%zmm4,%zmm4
	vpxord %zmm10,%zmm4,%zmm4
	vpshufb %zmm8,%zmm5,%zmm5
	vpshufb %zmm8,%zmm6,%zmm6
	vpshufb %zmm8,%zmm7,%zmm7
	vpclmulqdq $0x00,%zmm27,%zmm4,%zmm10
	vpclmulqdq $0x00,%zmm28,%zmm5,%zmm24
	vpclmulqdq $0x00,%zmm29,%zmm6,%zmm25
	vpxord %zmm24,%zmm10,%zmm10
	vpclmulqdq $0x00,%zmm30,%zmm7,%zmm26
	vpternlogd $0x96,%zmm26,%zmm25,%zmm10
	vpclmulqdq $0x01,%zmm27,%zmm4,%zmm24
	vpclmulqdq $0x01,%zmm28,%zmm5,%zmm25
	vpclmulqdq $0x01,%zmm29,%zmm6,%zmm26
	vpternlogd $0x96,%zmm26,%zmm25,%zmm24
	vpclmulqdq $0x01,%zmm30,%zmm7,%zmm25
	vpclmulqdq $0x10,%zmm27,%zmm4,%zmm26
	vpternlogd $0x96,%zmm26,%zmm25,%zmm24
	vpclmulqdq $0x10,%zmm28,%zmm5,%zmm25
	vpclmulqdq $0x10,%zmm29,%zmm6,%zmm26
	vpternlogd $0x96,%zmm26,%zmm25,%zmm24
	vpclmulqdq $0x01,%zmm10,%zmm31,%zmm26
	vpclmulqdq $0x10,%zmm30,%zmm7,%zmm25
	vpxord %zmm25,%zmm24,%zmm24
	vpshufd $0x4e,%zmm10,%zmm10
	vpclmulqdq $0x11,%zmm27,%zmm4,%zmm4
	vpclmulqdq $0x11,%zmm28,%zmm5,%zmm5
	vpclmulqdq $0x11,%zmm29,%zmm6,%zmm6
	vpternlogd $0x96,%zmm26,%zmm10,%zmm24
	vpclmulqdq $0x11,%zmm30,%zmm7,%zmm7
	vpternlogd $0x96,%zmm6,%zmm5,%zmm4
	vpclmulqdq $0x01,%zmm24,%zmm31,%zmm25
	vpxord %zmm7,%zmm4,%zmm10
	vpshufd $0x4e,%zmm24,%zmm24
	vpternlogd $0x96,%zmm25,%zmm24,%zmm10
	vextracti32x4 $1,%zmm10,%xmm4
	vextracti32x4 $2,%zmm10,%xmm5
	vextracti32x4 $3,%zmm10,%xmm6
	vpxord %xmm4,%xmm10,%xmm10
	vpternlogd $0x96,%xmm5,%xmm6,%xmm10

.Lcrypt_loop_4x_done__func1:

	testq %rdx,%rdx
	jz .Ldone__func1

	// Tail: fewer than 256 bytes remain. r8 is repointed into the table at
	// offset 256 minus the remaining length rounded up to 16, so that the
	// final 16-byte block meets the table entry at offset 240. The counter
	// pointer originally in r8 is no longer needed.
	// xmm4, xmm5, xmm6 collect the unreduced lo, mid and hi products; one
	// reduction runs at .Lreduce__func1.
	movq %rdx,%rax
	negq %rax
	andq $-16,%rax
	leaq 256(%r9,%rax,1),%r8
	vpxor %xmm4,%xmm4,%xmm4
	vpxor %xmm5,%xmm5,%xmm5
	vpxor %xmm6,%xmm6,%xmm6

	cmpq $64,%rdx
	jb .Lpartial_vec__func1

	// Whole 64-byte vectors of the tail.
.Lcrypt_loop_1x__func1:

	vpshufb %zmm8,%zmm12,%zmm0
	vpaddd %zmm11,%zmm12,%zmm12
	vpxord %zmm13,%zmm0,%zmm0
	leaq 16(%rcx),%rax
.Lvaesenc_loop_tail_full_vec__func1:
	vbroadcasti32x4 (%rax),%zmm9
	vaesenc %zmm9,%zmm0,%zmm0
	addq $16,%rax
	cmpq %rax,%r11
	jne .Lvaesenc_loop_tail_full_vec__func1
	vaesenclast %zmm14,%zmm0,%zmm0

	vmovdqu8 (%rdi),%zmm1
	vpxord %zmm1,%zmm0,%zmm0
	vmovdqu8 %zmm0,(%rsi)

	// Hash the output just written (still in zmm0).
	vmovdqu8 (%r8),%zmm30
	vpshufb %zmm8,%zmm0,%zmm0
	vpxord %zmm10,%zmm0,%zmm0
	vpclmulqdq $0x00,%zmm30,%zmm0,%zmm7
	vpclmulqdq $0x01,%zmm30,%zmm0,%zmm1
	vpclmulqdq $0x10,%zmm30,%zmm0,%zmm2
	vpclmulqdq $0x11,%zmm30,%zmm0,%zmm3
	vpxord %zmm7,%zmm4,%zmm4
	vpternlogd $0x96,%zmm2,%zmm1,%zmm5
	vpxord %zmm3,%zmm6,%zmm6

	// The old accumulator has been folded in once; clear it so later
	// vectors do not add it again.
	vpxor %xmm10,%xmm10,%xmm10

	addq $64,%r8
	addq $64,%rdi
	addq $64,%rsi
	subq $64,%rdx
	cmpq $64,%rdx
	jae .Lcrypt_loop_1x__func1

	testq %rdx,%rdx
	jz .Lreduce__func1

	// Final partial vector of 1 to 63 bytes.
	// k1 = one bit per remaining byte, for the data load and store.
	// k2 = one bit per byte of the length rounded up to 16, for the table
	//      load.
.Lpartial_vec__func1:

	movq $-1,%rax
	bzhiq %rdx,%rax,%rax
	kmovq %rax,%k1
	addq $15,%rdx
	andq $-16,%rdx
	movq $-1,%rax
	bzhiq %rdx,%rax,%rax
	kmovq %rax,%k2

	vpshufb %zmm8,%zmm12,%zmm0
	vpxord %zmm13,%zmm0,%zmm0
	leaq 16(%rcx),%rax
.Lvaesenc_loop_tail_partialvec__func1:
	vbroadcasti32x4 (%rax),%zmm9
	vaesenc %zmm9,%zmm0,%zmm0
	addq $16,%rax
	cmpq %rax,%r11
	jne .Lvaesenc_loop_tail_partialvec__func1
	vaesenclast %zmm14,%zmm0,%zmm0

	vmovdqu8 (%rdi),%zmm1{%k1}{z}
	vpxord %zmm1,%zmm0,%zmm0
	vmovdqu8 %zmm0,(%rsi){%k1}

	// Zero the bytes past the end of the data before hashing, so the last
	// block is hashed as if padded with zero bytes.
	vmovdqu8 (%r8),%zmm30{%k2}{z}
	vmovdqu8 %zmm0,%zmm1{%k1}{z}
	vpshufb %zmm8,%zmm1,%zmm0
	vpxord %zmm10,%zmm0,%zmm0
	vpclmulqdq $0x00,%zmm30,%zmm0,%zmm7
	vpclmulqdq $0x01,%zmm30,%zmm0,%zmm1
	vpclmulqdq $0x10,%zmm30,%zmm0,%zmm2
	vpclmulqdq $0x11,%zmm30,%zmm0,%zmm3
	vpxord %zmm7,%zmm4,%zmm4
	vpternlogd $0x96,%zmm2,%zmm1,%zmm5
	vpxord %zmm3,%zmm6,%zmm6

	// Single reduction of the accumulated lo (zmm4), mid (zmm5) and hi
	// (zmm6) products, then XOR of the four lanes into xmm10.
.Lreduce__func1:

	vpclmulqdq $0x01,%zmm4,%zmm31,%zmm0
	vpshufd $0x4e,%zmm4,%zmm4
	vpternlogd $0x96,%zmm0,%zmm4,%zmm5
	vpclmulqdq $0x01,%zmm5,%zmm31,%zmm0
	vpshufd $0x4e,%zmm5,%zmm5
	vpternlogd $0x96,%zmm0,%zmm5,%zmm6

	vextracti32x4 $1,%zmm6,%xmm0
	vextracti32x4 $2,%zmm6,%xmm1
	vextracti32x4 $3,%zmm6,%xmm2
	vpxord %xmm0,%xmm6,%xmm10
	vpternlogd $0x96,%xmm1,%xmm2,%xmm10

.Ldone__func1:

	// Reverse the bytes back and store the accumulator.
	vpshufb %xmm8,%xmm10,%xmm10
	vmovdqu %xmm10,(%r12)

	vzeroupper
	popq %r12
.cfi_adjust_cfa_offset -8
.cfi_restore %r12
	ret

.cfi_endproc
.size aes_gcm_enc_update_vaes_avx512, . - aes_gcm_enc_update_vaes_avx512

// aes_gcm_dec_update_vaes_avx512
//
// Counter-mode AES over a byte buffer, with the 16-byte hash accumulator
// updated over the bytes read from the source.
//
// ABI:   System V AMD64.
// In:    rdi = source bytes
//        rsi = destination bytes
//        rdx = byte count; any value is handled, a trailing partial vector
//              goes through byte-masked loads and stores
//        rcx = AES key schedule: 16-byte round keys from offset 0 and a
//              32-bit field at offset 240
//        r8  = 16-byte counter block; read only, never written back
//        r9  = 256-byte table (read only)
//        first stack argument = pointer to the 16-byte accumulator, read at
//              entry and written at exit
// Out:   none in registers
// Clobb: rax, rdx, rsi, rdi, r8, r10, r11, k1, k2, zmm0-zmm31, flags
//        (r12 is saved and restored)
//
// NOTE(review): r9 is presumably the table written by
// gcm_init_vpclmulqdq_avx512 - confirm with the caller.
// NOTE(review): the arithmetic on the field at 240(%rcx) only lines up with
// the round-key offsets if that field is 9, 11 or 13 for AES-128, AES-192
// and AES-256 (r10d then becomes 16, 24 or 32) - confirm against the key
// setup code.
//
// Differences from aes_gcm_enc_update_vaes_avx512 visible in the code:
//   - the hash input is the source data, loaded at the top of each pass,
//     so there is no separate first pass and no trailing hash pass;
//   - there is no BORINGSSL_DISPATCH_TEST hook.
//
// Counters are stepped with vpaddd on the byte-reversed block, so each
// 128-bit lane gets a 32-bit add with no carry into the next dword.
//
// Long-lived registers:
//   zmm8  byte-reversal mask          zmm31 .Lgfpoly
//   zmm10 accumulator (low lane)      zmm12 four lane counters
//   zmm11 counter increment           zmm13 first round key
//   zmm14 last round key              zmm15-zmm23 nine round keys before it
//   zmm27-zmm30 table rows at offsets 0, 64, 128, 192
//   r11   address of the last round key
.globl aes_gcm_dec_update_vaes_avx512
.hidden aes_gcm_dec_update_vaes_avx512
.type aes_gcm_dec_update_vaes_avx512,@function
.align 32
aes_gcm_dec_update_vaes_avx512:
.cfi_startproc

_CET_ENDBR
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-16

	// First stack argument: 8(%rsp) at entry, 16(%rsp) after the push.
	movq 16(%rsp),%r12

	vbroadcasti32x4 .Lbswap_mask(%rip),%zmm8
	vbroadcasti32x4 .Lgfpoly(%rip),%zmm31

	// Accumulator and counter block, both byte-reversed.
	vmovdqu (%r12),%xmm10
	vpshufb %xmm8,%xmm10,%xmm10
	vbroadcasti32x4 (%r8),%zmm12
	vpshufb %zmm8,%zmm12,%zmm12

	// r10d = 4 * field - 20; compared with 24 further down to pick the
	// number of extra rounds.
	movl 240(%rcx),%r10d
	leal -20(,%r10,4),%r10d

	// r11 = rcx + 96 + 4 * r10: address of the last round key.
	leaq 96(%rcx,%r10,4),%r11
	vbroadcasti32x4 (%rcx),%zmm13
	vbroadcasti32x4 (%r11),%zmm14

	// Lane counters become base+0, base+1, base+2, base+3.
	vpaddd .Lctr_pattern(%rip),%zmm12,%zmm12

	vbroadcasti32x4 .Linc_4blocks(%rip),%zmm11

	cmpq $256,%rdx
	jb .Lcrypt_loop_4x_done__func2

	vmovdqu8 256-256(%r9),%zmm27
	vmovdqu8 256-192(%r9),%zmm28
	vmovdqu8 256-128(%r9),%zmm29
	vmovdqu8 256-64(%r9),%zmm30

	// Cache the nine round keys that precede the last one.
	vbroadcasti32x4 -144(%r11),%zmm15
	vbroadcasti32x4 -128(%r11),%zmm16
	vbroadcasti32x4 -112(%r11),%zmm17
	vbroadcasti32x4 -96(%r11),%zmm18
	vbroadcasti32x4 -80(%r11),%zmm19
	vbroadcasti32x4 -64(%r11),%zmm20
	vbroadcasti32x4 -48(%r11),%zmm21
	vbroadcasti32x4 -32(%r11),%zmm22
	vbroadcasti32x4 -16(%r11),%zmm23

	// Main loop, 256 bytes per pass. The source vectors loaded here into
	// zmm4-zmm7 are hashed while the four counter vectors (zmm0-zmm3) run
	// through the AES rounds.
.Lcrypt_loop_4x__func2:
	vmovdqu8 0(%rdi),%zmm4
	vmovdqu8 64(%rdi),%zmm5
	vmovdqu8 128(%rdi),%zmm6
	vmovdqu8 192(%rdi),%zmm7

	vpshufb %zmm8,%zmm12,%zmm0
	vpaddd %zmm11,%zmm12,%zmm12
	vpshufb %zmm8,%zmm12,%zmm1
	vpaddd %zmm11,%zmm12,%zmm12
	vpshufb %zmm8,%zmm12,%zmm2
	vpaddd %zmm11,%zmm12,%zmm12
	vpshufb %zmm8,%zmm12,%zmm3
	vpaddd %zmm11,%zmm12,%zmm12

	vpxord %zmm13,%zmm0,%zmm0
	vpxord %zmm13,%zmm1,%zmm1
	vpxord %zmm13,%zmm2,%zmm2
	vpxord %zmm13,%zmm3,%zmm3

	// r10d below 24 skips all four extra rounds, equal to 24 skips two.
	cmpl $24,%r10d
	jl .Laes128__func2
	je .Laes192__func2

	vbroadcasti32x4 -208(%r11),%zmm9
	vaesenc %zmm9,%zmm0,%zmm0
	vaesenc %zmm9,%zmm1,%zmm1
	vaesenc %zmm9,%zmm2,%zmm2
	vaesenc %zmm9,%zmm3,%zmm3

	vbroadcasti32x4 -192(%r11),%zmm9
	vaesenc %zmm9,%zmm0,%zmm0
	vaesenc %zmm9,%zmm1,%zmm1
	vaesenc %zmm9,%zmm2,%zmm2
	vaesenc %zmm9,%zmm3,%zmm3

.Laes192__func2:
	vbroadcasti32x4 -176(%r11),%zmm9
	vaesenc %zmm9,%zmm0,%zmm0
	vaesenc %zmm9,%zmm1,%zmm1
	vaesenc %zmm9,%zmm2,%zmm2
	vaesenc %zmm9,%zmm3,%zmm3

	vbroadcasti32x4 -160(%r11),%zmm9
	vaesenc %zmm9,%zmm0,%zmm0
	vaesenc %zmm9,%zmm1,%zmm1
	vaesenc %zmm9,%zmm2,%zmm2
	vaesenc %zmm9,%zmm3,%zmm3

.Laes128__func2:

	// Prefetch the source 512 bytes ahead of the current position.
	prefetcht0 512+0(%rdi)
	prefetcht0 512+64(%rdi)
	prefetcht0 512+128(%rdi)
	prefetcht0 512+192(%rdi)

	// From here the hash of zmm4-zmm7 is interleaved with the nine cached
	// AES rounds. The order is deliberate; keep it when editing.
	vpshufb %zmm8,%zmm4,%zmm4
	vpxord %zmm10,%zmm4,%zmm4
	vpshufb %zmm8,%zmm5,%zmm5
	vpshufb %zmm8,%zmm6,%zmm6

	vaesenc %zmm15,%zmm0,%zmm0
	vaesenc %zmm15,%zmm1,%zmm1
	vaesenc %zmm15,%zmm2,%zmm2
	vaesenc %zmm15,%zmm3,%zmm3

	vpshufb %zmm8,%zmm7,%zmm7
	vpclmulqdq $0x00,%zmm27,%zmm4,%zmm10
	vpclmulqdq $0x00,%zmm28,%zmm5,%zmm24
	vpclmulqdq $0x00,%zmm29,%zmm6,%zmm25

	vaesenc %zmm16,%zmm0,%zmm0
	vaesenc %zmm16,%zmm1,%zmm1
	vaesenc %zmm16,%zmm2,%zmm2
	vaesenc %zmm16,%zmm3,%zmm3

	vpxord %zmm24,%zmm10,%zmm10
	vpclmulqdq $0x00,%zmm30,%zmm7,%zmm26
	vpternlogd $0x96,%zmm26,%zmm25,%zmm10
	vpclmulqdq $0x01,%zmm27,%zmm4,%zmm24

	vaesenc %zmm17,%zmm0,%zmm0
	vaesenc %zmm17,%zmm1,%zmm1
	vaesenc %zmm17,%zmm2,%zmm2
	vaesenc %zmm17,%zmm3,%zmm3

	vpclmulqdq $0x01,%zmm28,%zmm5,%zmm25
	vpclmulqdq $0x01,%zmm29,%zmm6,%zmm26
	vpternlogd $0x96,%zmm26,%zmm25,%zmm24
	vpclmulqdq $0x01,%zmm30,%zmm7,%zmm25

	vaesenc %zmm18,%zmm0,%zmm0
	vaesenc %zmm18,%zmm1,%zmm1
	vaesenc %zmm18,%zmm2,%zmm2
	vaesenc %zmm18,%zmm3,%zmm3

	vpclmulqdq $0x10,%zmm27,%zmm4,%zmm26
	vpternlogd $0x96,%zmm26,%zmm25,%zmm24
	vpclmulqdq $0x10,%zmm28,%zmm5,%zmm25
	vpclmulqdq $0x10,%zmm29,%zmm6,%zmm26

	vaesenc %zmm19,%zmm0,%zmm0
	vaesenc %zmm19,%zmm1,%zmm1
	vaesenc %zmm19,%zmm2,%zmm2
	vaesenc %zmm19,%zmm3,%zmm3

	vpternlogd $0x96,%zmm26,%zmm25,%zmm24
	vpclmulqdq $0x01,%zmm10,%zmm31,%zmm26
	vpclmulqdq $0x10,%zmm30,%zmm7,%zmm25
	vpxord %zmm25,%zmm24,%zmm24

	vaesenc %zmm20,%zmm0,%zmm0
	vaesenc %zmm20,%zmm1,%zmm1
	vaesenc %zmm20,%zmm2,%zmm2
	vaesenc %zmm20,%zmm3,%zmm3

	vpshufd $0x4e,%zmm10,%zmm10
	vpclmulqdq $0x11,%zmm27,%zmm4,%zmm4
	vpclmulqdq $0x11,%zmm28,%zmm5,%zmm5
	vpclmulqdq $0x11,%zmm29,%zmm6,%zmm6

	vaesenc %zmm21,%zmm0,%zmm0
	vaesenc %zmm21,%zmm1,%zmm1
	vaesenc %zmm21,%zmm2,%zmm2
	vaesenc %zmm21,%zmm3,%zmm3

	vpternlogd $0x96,%zmm26,%zmm10,%zmm24
	vpclmulqdq $0x11,%zmm30,%zmm7,%zmm7
	vpternlogd $0x96,%zmm6,%zmm5,%zmm4
	vpclmulqdq $0x01,%zmm24,%zmm31,%zmm25

	vaesenc %zmm22,%zmm0,%zmm0
	vaesenc %zmm22,%zmm1,%zmm1
	vaesenc %zmm22,%zmm2,%zmm2
	vaesenc %zmm22,%zmm3,%zmm3

	vpxord %zmm7,%zmm4,%zmm10
	vpshufd $0x4e,%zmm24,%zmm24
	vpternlogd $0x96,%zmm25,%zmm24,%zmm10

	vaesenc %zmm23,%zmm0,%zmm0
	vaesenc %zmm23,%zmm1,%zmm1
	vaesenc %zmm23,%zmm2,%zmm2
	vaesenc %zmm23,%zmm3,%zmm3

	// XOR the four 128-bit lanes of the hash into xmm10.
	vextracti32x4 $1,%zmm10,%xmm4
	vextracti32x4 $2,%zmm10,%xmm5
	vextracti32x4 $3,%zmm10,%xmm6
	vpxord %xmm4,%xmm10,%xmm10
	vpternlogd $0x96,%xmm5,%xmm6,%xmm10

	// Last round merged with the source XOR (the source is read again
	// here, before any store of this pass), then store 256 output bytes.
	vpxord 0(%rdi),%zmm14,%zmm4
	vpxord 64(%rdi),%zmm14,%zmm5
	vpxord 128(%rdi),%zmm14,%zmm6
	vpxord 192(%rdi),%zmm14,%zmm7
	vaesenclast %zmm4,%zmm0,%zmm4
	vaesenclast %zmm5,%zmm1,%zmm5
	vaesenclast %zmm6,%zmm2,%zmm6
	vaesenclast %zmm7,%zmm3,%zmm7
	vmovdqu8 %zmm4,0(%rsi)
	vmovdqu8 %zmm5,64(%rsi)
	vmovdqu8 %zmm6,128(%rsi)
	vmovdqu8 %zmm7,192(%rsi)

	addq $256,%rdi
	addq $256,%rsi
	subq $256,%rdx
	cmpq $256,%rdx
	jae .Lcrypt_loop_4x__func2
.Lcrypt_loop_4x_done__func2:

	testq %rdx,%rdx
	jz .Ldone__func2

	// Tail: fewer than 256 bytes remain. r8 is repointed into the table at
	// offset 256 minus the remaining length rounded up to 16, so that the
	// final 16-byte block meets the table entry at offset 240. The counter
	// pointer originally in r8 is no longer needed.
	// xmm4, xmm5, xmm6 collect the unreduced lo, mid and hi products; one
	// reduction runs at .Lreduce__func2.
	movq %rdx,%rax
	negq %rax
	andq $-16,%rax
	leaq 256(%r9,%rax,1),%r8
	vpxor %xmm4,%xmm4,%xmm4
	vpxor %xmm5,%xmm5,%xmm5
	vpxor %xmm6,%xmm6,%xmm6

	cmpq $64,%rdx
	jb .Lpartial_vec__func2

	// Whole 64-byte vectors of the tail.
.Lcrypt_loop_1x__func2:

	vpshufb %zmm8,%zmm12,%zmm0
	vpaddd %zmm11,%zmm12,%zmm12
	vpxord %zmm13,%zmm0,%zmm0
	leaq 16(%rcx),%rax
.Lvaesenc_loop_tail_full_vec__func2:
	vbroadcasti32x4 (%rax),%zmm9
	vaesenc %zmm9,%zmm0,%zmm0
	addq $16,%rax
	cmpq %rax,%r11
	jne .Lvaesenc_loop_tail_full_vec__func2
	vaesenclast %zmm14,%zmm0,%zmm0

	vmovdqu8 (%rdi),%zmm1
	vpxord %zmm1,%zmm0,%zmm0
	vmovdqu8 %zmm0,(%rsi)

	// Hash the source vector (zmm1), not the output.
	vmovdqu8 (%r8),%zmm30
	vpshufb %zmm8,%zmm1,%zmm0
	vpxord %zmm10,%zmm0,%zmm0
	vpclmulqdq $0x00,%zmm30,%zmm0,%zmm7
	vpclmulqdq $0x01,%zmm30,%zmm0,%zmm1
	vpclmulqdq $0x10,%zmm30,%zmm0,%zmm2
	vpclmulqdq $0x11,%zmm30,%zmm0,%zmm3
	vpxord %zmm7,%zmm4,%zmm4
	vpternlogd $0x96,%zmm2,%zmm1,%zmm5
	vpxord %zmm3,%zmm6,%zmm6

	// The old accumulator has been folded in once; clear it so later
	// vectors do not add it again.
	vpxor %xmm10,%xmm10,%xmm10

	addq $64,%r8
	addq $64,%rdi
	addq $64,%rsi
	subq $64,%rdx
	cmpq $64,%rdx
	jae .Lcrypt_loop_1x__func2

	testq %rdx,%rdx
	jz .Lreduce__func2

	// Final partial vector of 1 to 63 bytes.
	// k1 = one bit per remaining byte, for the data load and store.
	// k2 = one bit per byte of the length rounded up to 16, for the table
	//      load.
.Lpartial_vec__func2:

	movq $-1,%rax
	bzhiq %rdx,%rax,%rax
	kmovq %rax,%k1
	addq $15,%rdx
	andq $-16,%rdx
	movq $-1,%rax
	bzhiq %rdx,%rax,%rax
	kmovq %rax,%k2

	vpshufb %zmm8,%zmm12,%zmm0
	vpxord %zmm13,%zmm0,%zmm0
	leaq 16(%rcx),%rax
.Lvaesenc_loop_tail_partialvec__func2:
	vbroadcasti32x4 (%rax),%zmm9
	vaesenc %zmm9,%zmm0,%zmm0
	addq $16,%rax
	cmpq %rax,%r11
	jne .Lvaesenc_loop_tail_partialvec__func2
	vaesenclast %zmm14,%zmm0,%zmm0

	// The zero-masking load leaves zmm1 zero past the end of the data, so
	// the last block is hashed as if padded with zero bytes.
	vmovdqu8 (%rdi),%zmm1{%k1}{z}
	vpxord %zmm1,%zmm0,%zmm0
	vmovdqu8 %zmm0,(%rsi){%k1}

	vmovdqu8 (%r8),%zmm30{%k2}{z}

	vpshufb %zmm8,%zmm1,%zmm0
	vpxord %zmm10,%zmm0,%zmm0
	vpclmulqdq $0x00,%zmm30,%zmm0,%zmm7
	vpclmulqdq $0x01,%zmm30,%zmm0,%zmm1
	vpclmulqdq $0x10,%zmm30,%zmm0,%zmm2
	vpclmulqdq $0x11,%zmm30,%zmm0,%zmm3
	vpxord %zmm7,%zmm4,%zmm4
	vpternlogd $0x96,%zmm2,%zmm1,%zmm5
	vpxord %zmm3,%zmm6,%zmm6

	// Single reduction of the accumulated lo (zmm4), mid (zmm5) and hi
	// (zmm6) products, then XOR of the four lanes into xmm10.
.Lreduce__func2:

	vpclmulqdq $0x01,%zmm4,%zmm31,%zmm0
	vpshufd $0x4e,%zmm4,%zmm4
	vpternlogd $0x96,%zmm0,%zmm4,%zmm5
	vpclmulqdq $0x01,%zmm5,%zmm31,%zmm0
	vpshufd $0x4e,%zmm5,%zmm5
	vpternlogd $0x96,%zmm0,%zmm5,%zmm6

	vextracti32x4 $1,%zmm6,%xmm0
	vextracti32x4 $2,%zmm6,%xmm1
	vextracti32x4 $3,%zmm6,%xmm2
	vpxord %xmm0,%xmm6,%xmm10
	vpternlogd $0x96,%xmm1,%xmm2,%xmm10

.Ldone__func2:

	// Reverse the bytes back and store the accumulator.
	vpshufb %xmm8,%xmm10,%xmm10
	vmovdqu %xmm10,(%r12)

	vzeroupper
	popq %r12
.cfi_adjust_cfa_offset -8
.cfi_restore %r12
	ret

.cfi_endproc
.size aes_gcm_dec_update_vaes_avx512, . - aes_gcm_dec_update_vaes_avx512
#endif