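# ossl_vaes_vpclmulqdq_capable -- runtime capability probe.
# Reads the extended feature words of OPENSSL_ia32cap_P and returns non-zero
# only when every bit in the mask below is set; the mask appears to cover the
# AVX512F/DQ/BW/VL, VAES and VPCLMULQDQ feature bits this file relies on.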
.globl ossl_vaes_vpclmulqdq_capable
.type ossl_vaes_vpclmulqdq_capable,@function
.align 32
ossl_vaes_vpclmulqdq_capable:
movq OPENSSL_ia32cap_P+8(%rip),%rcx

movq $6600291188736,%rdx
xorl %eax,%eax
andq %rdx,%rcx
cmpq %rdx,%rcx
cmoveq %rcx,%rax
.byte 0xf3,0xc3
.size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable

.text
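# ossl_aes_gcm_init_avx512 -- GHASH key setup.
# Encrypts an all-zero block with the caller's AES key schedule (%rdi; the
# value at 240(%rdi) selects the AES-128/192/256 path) to obtain the hash key
# H, derives H<<1 reduced by the GCM polynomial, and precomputes a ladder of
# powers of H into the context pointed to by %rsi (the stores at
# 96..336(%rsi) below) for the bulk GHASH code in this file.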
.globl ossl_aes_gcm_init_avx512
.type ossl_aes_gcm_init_avx512,@function
.align 32
ossl_aes_gcm_init_avx512:
.cfi_startproc
.byte 243,15,30,250
vpxorq %xmm16,%xmm16,%xmm16

movl 240(%rdi),%eax
cmpl $9,%eax
je .Laes_128_0
cmpl $11,%eax
je .Laes_192_0
cmpl $13,%eax
je .Laes_256_0
jmp .Lexit_aes_0
|
|
.align 32
|
|
.Laes_128_0:
|
|
vpxorq 0(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 16(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 32(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 48(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 64(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 80(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 96(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 112(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 128(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 144(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenclast 160(%rdi),%xmm16,%xmm16
|
|
jmp .Lexit_aes_0
|
|
.align 32
|
|
.Laes_192_0:
|
|
vpxorq 0(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 16(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 32(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 48(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 64(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 80(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 96(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 112(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 128(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 144(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 160(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 176(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenclast 192(%rdi),%xmm16,%xmm16
|
|
jmp .Lexit_aes_0
|
|
.align 32
|
|
.Laes_256_0:
|
|
vpxorq 0(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 16(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 32(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 48(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 64(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 80(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 96(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 112(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 128(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 144(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 160(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 176(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 192(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenc 208(%rdi),%xmm16,%xmm16
|
|
|
|
vaesenclast 224(%rdi),%xmm16,%xmm16
|
|
jmp .Lexit_aes_0
|
|
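# All key sizes land here with E_K(0^128) in %xmm16: byte-swap it, compute
# H = (E_K(0^128) << 1) reduced by the GCM polynomial (the TWOONE/POLY masks
# implement the conditional xor for the carried-out bit), and store H at
# 336(%rsi).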
.Lexit_aes_0:

vpshufb SHUF_MASK(%rip),%xmm16,%xmm16

vmovdqa64 %xmm16,%xmm2
vpsllq $1,%xmm16,%xmm16
vpsrlq $63,%xmm2,%xmm2
vmovdqa %xmm2,%xmm1
vpslldq $8,%xmm2,%xmm2
vpsrldq $8,%xmm1,%xmm1
vporq %xmm2,%xmm16,%xmm16

vpshufd $36,%xmm1,%xmm2
vpcmpeqd TWOONE(%rip),%xmm2,%xmm2
vpand POLY(%rip),%xmm2,%xmm2
vpxorq %xmm2,%xmm16,%xmm16

vmovdqu64 %xmm16,336(%rsi)
|
|
vshufi32x4 $0x00,%ymm16,%ymm16,%ymm4
|
|
vmovdqa %ymm4,%ymm3
|
|
|
|
vpclmulqdq $0x11,%ymm4,%ymm3,%ymm0
|
|
vpclmulqdq $0x00,%ymm4,%ymm3,%ymm1
|
|
vpclmulqdq $0x01,%ymm4,%ymm3,%ymm2
|
|
vpclmulqdq $0x10,%ymm4,%ymm3,%ymm3
|
|
vpxorq %ymm2,%ymm3,%ymm3
|
|
|
|
vpsrldq $8,%ymm3,%ymm2
|
|
vpslldq $8,%ymm3,%ymm3
|
|
vpxorq %ymm2,%ymm0,%ymm0
|
|
vpxorq %ymm1,%ymm3,%ymm3
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%ymm2
|
|
|
|
vpclmulqdq $0x01,%ymm3,%ymm2,%ymm1
|
|
vpslldq $8,%ymm1,%ymm1
|
|
vpxorq %ymm1,%ymm3,%ymm3
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%ymm3,%ymm2,%ymm1
|
|
vpsrldq $4,%ymm1,%ymm1
|
|
vpclmulqdq $0x10,%ymm3,%ymm2,%ymm3
|
|
vpslldq $4,%ymm3,%ymm3
|
|
|
|
vpternlogq $0x96,%ymm1,%ymm0,%ymm3
|
|
|
|
vmovdqu64 %xmm3,320(%rsi)
|
|
vinserti64x2 $1,%xmm16,%ymm3,%ymm4
|
|
vmovdqa64 %ymm4,%ymm5
|
|
|
|
vpclmulqdq $0x11,%ymm3,%ymm4,%ymm0
|
|
vpclmulqdq $0x00,%ymm3,%ymm4,%ymm1
|
|
vpclmulqdq $0x01,%ymm3,%ymm4,%ymm2
|
|
vpclmulqdq $0x10,%ymm3,%ymm4,%ymm4
|
|
vpxorq %ymm2,%ymm4,%ymm4
|
|
|
|
vpsrldq $8,%ymm4,%ymm2
|
|
vpslldq $8,%ymm4,%ymm4
|
|
vpxorq %ymm2,%ymm0,%ymm0
|
|
vpxorq %ymm1,%ymm4,%ymm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%ymm2
|
|
|
|
vpclmulqdq $0x01,%ymm4,%ymm2,%ymm1
|
|
vpslldq $8,%ymm1,%ymm1
|
|
vpxorq %ymm1,%ymm4,%ymm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%ymm4,%ymm2,%ymm1
|
|
vpsrldq $4,%ymm1,%ymm1
|
|
vpclmulqdq $0x10,%ymm4,%ymm2,%ymm4
|
|
vpslldq $4,%ymm4,%ymm4
|
|
|
|
vpternlogq $0x96,%ymm1,%ymm0,%ymm4
|
|
|
|
vmovdqu64 %ymm4,288(%rsi)
|
|
|
|
vinserti64x4 $1,%ymm5,%zmm4,%zmm4
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm4,%zmm4,%zmm3
|
|
vmovdqa64 %zmm4,%zmm5
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm0
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm1
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm2
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm2,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm2
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm2,%zmm0,%zmm0
|
|
vpxorq %zmm1,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm2
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm2,%zmm1
|
|
vpslldq $8,%zmm1,%zmm1
|
|
vpxorq %zmm1,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm2,%zmm1
|
|
vpsrldq $4,%zmm1,%zmm1
|
|
vpclmulqdq $0x10,%zmm4,%zmm2,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm1,%zmm0,%zmm4
|
|
|
|
vmovdqu64 %zmm4,224(%rsi)
|
|
vshufi64x2 $0x00,%zmm4,%zmm4,%zmm3
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm0
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm1
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm2
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm2,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm2
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm2,%zmm0,%zmm0
|
|
vpxorq %zmm1,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm2
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm2,%zmm1
|
|
vpslldq $8,%zmm1,%zmm1
|
|
vpxorq %zmm1,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm2,%zmm1
|
|
vpsrldq $4,%zmm1,%zmm1
|
|
vpclmulqdq $0x10,%zmm5,%zmm2,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm1,%zmm0,%zmm5
|
|
|
|
vmovdqu64 %zmm5,160(%rsi)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm0
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm1
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm2
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm2,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm2
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm2,%zmm0,%zmm0
|
|
vpxorq %zmm1,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm2
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm2,%zmm1
|
|
vpslldq $8,%zmm1,%zmm1
|
|
vpxorq %zmm1,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm2,%zmm1
|
|
vpsrldq $4,%zmm1,%zmm1
|
|
vpclmulqdq $0x10,%zmm4,%zmm2,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm1,%zmm0,%zmm4
|
|
|
|
vmovdqu64 %zmm4,96(%rsi)
|
|
vzeroupper
|
|
.Labort_init:
|
|
.byte 0xf3,0xc3
|
|
.cfi_endproc
|
|
.size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512
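# ossl_aes_gcm_setiv_avx512 -- IV processing.
# For the common 12-byte IV (%rcx == 12) the pre-counter block J0 is simply
# IV || 0x00000001 (iv_len_12_init_IV below); any other length is run through
# GHASH, finishing with a block that carries the IV bit length, to derive J0.
# J0 is then encrypted once; the counter block and its encryption appear to be
# stored at 0(%rsi) and 32(%rsi) respectively for later use.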
.globl ossl_aes_gcm_setiv_avx512
.type ossl_aes_gcm_setiv_avx512,@function
.align 32
ossl_aes_gcm_setiv_avx512:
.cfi_startproc
.Lsetiv_seh_begin:
.byte 243,15,30,250
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
.Lsetiv_seh_push_rbx:
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
.Lsetiv_seh_push_rbp:
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
.Lsetiv_seh_push_r12:
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
.Lsetiv_seh_push_r13:
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
.Lsetiv_seh_push_r14:
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lsetiv_seh_push_r15:

leaq 0(%rsp),%rbp
.cfi_def_cfa_register %rbp
.Lsetiv_seh_setfp:

.Lsetiv_seh_prolog_end:
subq $820,%rsp
andq $(-64),%rsp
cmpq $12,%rcx
je iv_len_12_init_IV
vpxor %xmm2,%xmm2,%xmm2
movq %rdx,%r10
movq %rcx,%r11
orq %r11,%r11
jz .L_CALC_AAD_done_1

xorq %rbx,%rbx
vmovdqa64 SHUF_MASK(%rip),%zmm16
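# Main GHASH loop over the IV data: each iteration folds 768 bytes (48 blocks)
# into the accumulator in %xmm2. On the first pass the Htable kept in the
# context (%rsi) appears to be extended on the stack (0..704(%rsp)) with the
# higher powers of H needed for 48-block aggregation; %rbx records that this
# precomputation has already run.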
.L_get_AAD_loop48x16_1:
|
|
cmpq $768,%r11
|
|
jl .L_exit_AAD_loop48x16_1
|
|
vmovdqu64 0(%r10),%zmm11
|
|
vmovdqu64 64(%r10),%zmm3
|
|
vmovdqu64 128(%r10),%zmm4
|
|
vmovdqu64 192(%r10),%zmm5
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpshufb %zmm16,%zmm5,%zmm5
|
|
testq %rbx,%rbx
|
|
jnz .L_skip_hkeys_precomputation_2
|
|
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vmovdqu64 %zmm1,704(%rsp)
|
|
|
|
vmovdqu64 224(%rsi),%zmm9
|
|
vmovdqu64 %zmm9,640(%rsp)
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm9,%zmm9,%zmm9
|
|
|
|
vmovdqu64 160(%rsi),%zmm10
|
|
vmovdqu64 %zmm10,576(%rsp)
|
|
|
|
vmovdqu64 96(%rsi),%zmm12
|
|
vmovdqu64 %zmm12,512(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm10,%zmm10
|
|
|
|
vpsrldq $8,%zmm10,%zmm17
|
|
vpslldq $8,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
|
|
vpslldq $4,%zmm10,%zmm10
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm10
|
|
|
|
vmovdqu64 %zmm10,448(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm12,%zmm12
|
|
|
|
vpsrldq $8,%zmm12,%zmm17
|
|
vpslldq $8,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
|
|
vpslldq $4,%zmm12,%zmm12
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm12
|
|
|
|
vmovdqu64 %zmm12,384(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm10,%zmm10
|
|
|
|
vpsrldq $8,%zmm10,%zmm17
|
|
vpslldq $8,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
|
|
vpslldq $4,%zmm10,%zmm10
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm10
|
|
|
|
vmovdqu64 %zmm10,320(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm12,%zmm12
|
|
|
|
vpsrldq $8,%zmm12,%zmm17
|
|
vpslldq $8,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
|
|
vpslldq $4,%zmm12,%zmm12
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm12
|
|
|
|
vmovdqu64 %zmm12,256(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm10,%zmm10
|
|
|
|
vpsrldq $8,%zmm10,%zmm17
|
|
vpslldq $8,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
|
|
vpslldq $4,%zmm10,%zmm10
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm10
|
|
|
|
vmovdqu64 %zmm10,192(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm12,%zmm12
|
|
|
|
vpsrldq $8,%zmm12,%zmm17
|
|
vpslldq $8,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
|
|
vpslldq $4,%zmm12,%zmm12
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm12
|
|
|
|
vmovdqu64 %zmm12,128(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm10,%zmm10
|
|
|
|
vpsrldq $8,%zmm10,%zmm17
|
|
vpslldq $8,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
|
|
vpslldq $4,%zmm10,%zmm10
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm10
|
|
|
|
vmovdqu64 %zmm10,64(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm12,%zmm12
|
|
|
|
vpsrldq $8,%zmm12,%zmm17
|
|
vpslldq $8,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
|
|
vpslldq $4,%zmm12,%zmm12
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm12
|
|
|
|
vmovdqu64 %zmm12,0(%rsp)
|
|
.L_skip_hkeys_precomputation_2:
|
|
movq $1,%rbx
|
|
vpxorq %zmm2,%zmm11,%zmm11
|
|
vmovdqu64 0(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
|
|
vmovdqu64 64(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
|
|
vpxorq %zmm17,%zmm10,%zmm7
|
|
vpxorq %zmm13,%zmm1,%zmm6
|
|
vpxorq %zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
vmovdqu64 128(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
|
|
vmovdqu64 192(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm13,%zmm1,%zmm6
|
|
vpternlogq $0x96,%zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
vmovdqu64 256(%r10),%zmm11
|
|
vmovdqu64 320(%r10),%zmm3
|
|
vmovdqu64 384(%r10),%zmm4
|
|
vmovdqu64 448(%r10),%zmm5
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpshufb %zmm16,%zmm5,%zmm5
|
|
vmovdqu64 256(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
|
|
vmovdqu64 320(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
|
|
vpternlogq $0x96,%zmm17,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm13,%zmm1,%zmm6
|
|
vpternlogq $0x96,%zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
vmovdqu64 384(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
|
|
vmovdqu64 448(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm13,%zmm1,%zmm6
|
|
vpternlogq $0x96,%zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
vmovdqu64 512(%r10),%zmm11
|
|
vmovdqu64 576(%r10),%zmm3
|
|
vmovdqu64 640(%r10),%zmm4
|
|
vmovdqu64 704(%r10),%zmm5
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpshufb %zmm16,%zmm5,%zmm5
|
|
vmovdqu64 512(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
|
|
vmovdqu64 576(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
|
|
vpternlogq $0x96,%zmm17,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm13,%zmm1,%zmm6
|
|
vpternlogq $0x96,%zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
vmovdqu64 640(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
|
|
vmovdqu64 704(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm13,%zmm1,%zmm6
|
|
vpternlogq $0x96,%zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
|
|
vpsrldq $8,%zmm7,%zmm1
|
|
vpslldq $8,%zmm7,%zmm9
|
|
vpxorq %zmm1,%zmm6,%zmm6
|
|
vpxorq %zmm9,%zmm8,%zmm8
|
|
vextracti64x4 $1,%zmm6,%ymm1
|
|
vpxorq %ymm1,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm1
|
|
vpxorq %xmm1,%xmm6,%xmm6
|
|
vextracti64x4 $1,%zmm8,%ymm9
|
|
vpxorq %ymm9,%ymm8,%ymm8
|
|
vextracti32x4 $1,%ymm8,%xmm9
|
|
vpxorq %xmm9,%xmm8,%xmm8
|
|
vmovdqa64 POLY2(%rip),%xmm10
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1
|
|
vpslldq $8,%xmm1,%xmm1
|
|
vpxorq %xmm1,%xmm8,%xmm1
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9
|
|
vpsrldq $4,%xmm9,%xmm9
|
|
vpclmulqdq $0x10,%xmm1,%xmm10,%xmm2
|
|
vpslldq $4,%xmm2,%xmm2
|
|
vpternlogq $0x96,%xmm6,%xmm9,%xmm2
|
|
|
|
subq $768,%r11
|
|
je .L_CALC_AAD_done_1
|
|
|
|
addq $768,%r10
|
|
jmp .L_get_AAD_loop48x16_1
|
|
|
|
.L_exit_AAD_loop48x16_1:
|
|
|
|
cmpq $512,%r11
|
|
jl .L_less_than_32x16_1
|
|
|
|
vmovdqu64 0(%r10),%zmm11
|
|
vmovdqu64 64(%r10),%zmm3
|
|
vmovdqu64 128(%r10),%zmm4
|
|
vmovdqu64 192(%r10),%zmm5
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpshufb %zmm16,%zmm5,%zmm5
|
|
testq %rbx,%rbx
|
|
jnz .L_skip_hkeys_precomputation_3
|
|
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vmovdqu64 %zmm1,704(%rsp)
|
|
|
|
vmovdqu64 224(%rsi),%zmm9
|
|
vmovdqu64 %zmm9,640(%rsp)
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm9,%zmm9,%zmm9
|
|
|
|
vmovdqu64 160(%rsi),%zmm10
|
|
vmovdqu64 %zmm10,576(%rsp)
|
|
|
|
vmovdqu64 96(%rsi),%zmm12
|
|
vmovdqu64 %zmm12,512(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm10,%zmm10
|
|
|
|
vpsrldq $8,%zmm10,%zmm17
|
|
vpslldq $8,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
|
|
vpslldq $4,%zmm10,%zmm10
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm10
|
|
|
|
vmovdqu64 %zmm10,448(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm12,%zmm12
|
|
|
|
vpsrldq $8,%zmm12,%zmm17
|
|
vpslldq $8,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
|
|
vpslldq $4,%zmm12,%zmm12
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm12
|
|
|
|
vmovdqu64 %zmm12,384(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm10,%zmm10
|
|
|
|
vpsrldq $8,%zmm10,%zmm17
|
|
vpslldq $8,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
|
|
vpslldq $4,%zmm10,%zmm10
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm10
|
|
|
|
vmovdqu64 %zmm10,320(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm12,%zmm12
|
|
|
|
vpsrldq $8,%zmm12,%zmm17
|
|
vpslldq $8,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
|
|
vpslldq $4,%zmm12,%zmm12
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm12
|
|
|
|
vmovdqu64 %zmm12,256(%rsp)
|
|
.L_skip_hkeys_precomputation_3:
|
|
movq $1,%rbx
|
|
vpxorq %zmm2,%zmm11,%zmm11
|
|
vmovdqu64 256(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
|
|
vmovdqu64 320(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
|
|
vpxorq %zmm17,%zmm10,%zmm7
|
|
vpxorq %zmm13,%zmm1,%zmm6
|
|
vpxorq %zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
vmovdqu64 384(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
|
|
vmovdqu64 448(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm13,%zmm1,%zmm6
|
|
vpternlogq $0x96,%zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
vmovdqu64 256(%r10),%zmm11
|
|
vmovdqu64 320(%r10),%zmm3
|
|
vmovdqu64 384(%r10),%zmm4
|
|
vmovdqu64 448(%r10),%zmm5
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpshufb %zmm16,%zmm5,%zmm5
|
|
vmovdqu64 512(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
|
|
vmovdqu64 576(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
|
|
vpternlogq $0x96,%zmm17,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm13,%zmm1,%zmm6
|
|
vpternlogq $0x96,%zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
vmovdqu64 640(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
|
|
vmovdqu64 704(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm13,%zmm1,%zmm6
|
|
vpternlogq $0x96,%zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
|
|
vpsrldq $8,%zmm7,%zmm1
|
|
vpslldq $8,%zmm7,%zmm9
|
|
vpxorq %zmm1,%zmm6,%zmm6
|
|
vpxorq %zmm9,%zmm8,%zmm8
|
|
vextracti64x4 $1,%zmm6,%ymm1
|
|
vpxorq %ymm1,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm1
|
|
vpxorq %xmm1,%xmm6,%xmm6
|
|
vextracti64x4 $1,%zmm8,%ymm9
|
|
vpxorq %ymm9,%ymm8,%ymm8
|
|
vextracti32x4 $1,%ymm8,%xmm9
|
|
vpxorq %xmm9,%xmm8,%xmm8
|
|
vmovdqa64 POLY2(%rip),%xmm10
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1
|
|
vpslldq $8,%xmm1,%xmm1
|
|
vpxorq %xmm1,%xmm8,%xmm1
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9
|
|
vpsrldq $4,%xmm9,%xmm9
|
|
vpclmulqdq $0x10,%xmm1,%xmm10,%xmm2
|
|
vpslldq $4,%xmm2,%xmm2
|
|
vpternlogq $0x96,%xmm6,%xmm9,%xmm2
|
|
|
|
subq $512,%r11
|
|
je .L_CALC_AAD_done_1
|
|
|
|
addq $512,%r10
|
|
jmp .L_less_than_16x16_1
|
|
|
|
.L_less_than_32x16_1:
|
|
cmpq $256,%r11
|
|
jl .L_less_than_16x16_1
|
|
|
|
vmovdqu64 0(%r10),%zmm11
|
|
vmovdqu64 64(%r10),%zmm3
|
|
vmovdqu64 128(%r10),%zmm4
|
|
vmovdqu64 192(%r10),%zmm5
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpshufb %zmm16,%zmm5,%zmm5
|
|
vpxorq %zmm2,%zmm11,%zmm11
|
|
vmovdqu64 96(%rsi),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
|
|
vmovdqu64 160(%rsi),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
|
|
vpxorq %zmm17,%zmm10,%zmm7
|
|
vpxorq %zmm13,%zmm1,%zmm6
|
|
vpxorq %zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
vmovdqu64 224(%rsi),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
|
|
vmovdqu64 288(%rsi),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm13,%zmm1,%zmm6
|
|
vpternlogq $0x96,%zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
|
|
vpsrldq $8,%zmm7,%zmm1
|
|
vpslldq $8,%zmm7,%zmm9
|
|
vpxorq %zmm1,%zmm6,%zmm6
|
|
vpxorq %zmm9,%zmm8,%zmm8
|
|
vextracti64x4 $1,%zmm6,%ymm1
|
|
vpxorq %ymm1,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm1
|
|
vpxorq %xmm1,%xmm6,%xmm6
|
|
vextracti64x4 $1,%zmm8,%ymm9
|
|
vpxorq %ymm9,%ymm8,%ymm8
|
|
vextracti32x4 $1,%ymm8,%xmm9
|
|
vpxorq %xmm9,%xmm8,%xmm8
|
|
vmovdqa64 POLY2(%rip),%xmm10
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1
|
|
vpslldq $8,%xmm1,%xmm1
|
|
vpxorq %xmm1,%xmm8,%xmm1
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9
|
|
vpsrldq $4,%xmm9,%xmm9
|
|
vpclmulqdq $0x10,%xmm1,%xmm10,%xmm2
|
|
vpslldq $4,%xmm2,%xmm2
|
|
vpternlogq $0x96,%xmm6,%xmm9,%xmm2
|
|
|
|
subq $256,%r11
|
|
je .L_CALC_AAD_done_1
|
|
|
|
addq $256,%r10
|
|
|
|
.L_less_than_16x16_1:
|
|
|
|
leaq byte64_len_to_mask_table(%rip),%r12
|
|
leaq (%r12,%r11,8),%r12
|
|
|
|
|
|
addl $15,%r11d
|
|
shrl $4,%r11d
|
|
cmpl $2,%r11d
|
|
jb .L_AAD_blocks_1_1
|
|
je .L_AAD_blocks_2_1
|
|
cmpl $4,%r11d
|
|
jb .L_AAD_blocks_3_1
|
|
je .L_AAD_blocks_4_1
|
|
cmpl $6,%r11d
|
|
jb .L_AAD_blocks_5_1
|
|
je .L_AAD_blocks_6_1
|
|
cmpl $8,%r11d
|
|
jb .L_AAD_blocks_7_1
|
|
je .L_AAD_blocks_8_1
|
|
cmpl $10,%r11d
|
|
jb .L_AAD_blocks_9_1
|
|
je .L_AAD_blocks_10_1
|
|
cmpl $12,%r11d
|
|
jb .L_AAD_blocks_11_1
|
|
je .L_AAD_blocks_12_1
|
|
cmpl $14,%r11d
|
|
jb .L_AAD_blocks_13_1
|
|
je .L_AAD_blocks_14_1
|
|
cmpl $15,%r11d
|
|
je .L_AAD_blocks_15_1
|
|
.L_AAD_blocks_16_1:
|
|
subq $1536,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%zmm3
|
|
vmovdqu8 128(%r10),%zmm4
|
|
vmovdqu8 192(%r10),%zmm5{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpshufb %zmm16,%zmm5,%zmm5
|
|
vpxorq %zmm2,%zmm11,%zmm11
|
|
vmovdqu64 96(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
|
|
vmovdqu64 160(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
|
|
vmovdqu64 224(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11
|
|
vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3
|
|
vpternlogq $0x96,%zmm9,%zmm11,%zmm1
|
|
vpternlogq $0x96,%zmm10,%zmm3,%zmm6
|
|
vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11
|
|
vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3
|
|
vpternlogq $0x96,%zmm12,%zmm11,%zmm7
|
|
vpternlogq $0x96,%zmm13,%zmm3,%zmm8
|
|
vmovdqu64 288(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm5,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm5,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm5,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm5,%zmm13
|
|
vpxorq %zmm9,%zmm1,%zmm9
|
|
vpxorq %zmm10,%zmm6,%zmm10
|
|
vpxorq %zmm12,%zmm7,%zmm12
|
|
vpxorq %zmm13,%zmm8,%zmm13
|
|
|
|
vpxorq %zmm13,%zmm12,%zmm12
|
|
vpsrldq $8,%zmm12,%zmm7
|
|
vpslldq $8,%zmm12,%zmm8
|
|
vpxorq %zmm7,%zmm9,%zmm1
|
|
vpxorq %zmm8,%zmm10,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
|
|
vpslldq $4,%xmm2,%xmm2
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm2
|
|
|
|
jmp .L_CALC_AAD_done_1
|
|
.L_AAD_blocks_15_1:
|
|
subq $1536,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%zmm3
|
|
vmovdqu8 128(%r10),%zmm4
|
|
vmovdqu8 192(%r10),%zmm5{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpshufb %zmm16,%zmm5,%zmm5
|
|
vpxorq %zmm2,%zmm11,%zmm11
|
|
vmovdqu64 112(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
|
|
vmovdqu64 176(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
|
|
vmovdqu64 240(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11
|
|
vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3
|
|
vpternlogq $0x96,%zmm1,%zmm11,%zmm9
|
|
vpternlogq $0x96,%zmm6,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11
|
|
vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3
|
|
vpternlogq $0x96,%zmm7,%zmm11,%zmm12
|
|
vpternlogq $0x96,%zmm8,%zmm3,%zmm13
|
|
vmovdqu64 304(%rsi),%ymm15
|
|
vinserti64x2 $2,336(%rsi),%zmm15,%zmm15
|
|
vpclmulqdq $0x01,%zmm15,%zmm5,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm5,%zmm8
|
|
vpclmulqdq $0x11,%zmm15,%zmm5,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm5,%zmm6
|
|
|
|
vpxorq %zmm12,%zmm7,%zmm7
|
|
vpxorq %zmm13,%zmm8,%zmm8
|
|
vpxorq %zmm9,%zmm1,%zmm1
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
|
|
vpslldq $4,%xmm2,%xmm2
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm2
|
|
|
|
jmp .L_CALC_AAD_done_1
|
|
.L_AAD_blocks_14_1:
|
|
subq $1536,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%zmm3
|
|
vmovdqu8 128(%r10),%zmm4
|
|
vmovdqu8 192(%r10),%ymm5{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpshufb %ymm16,%ymm5,%ymm5
|
|
vpxorq %zmm2,%zmm11,%zmm11
|
|
vmovdqu64 128(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
|
|
vmovdqu64 192(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
|
|
vmovdqu64 256(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11
|
|
vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3
|
|
vpternlogq $0x96,%zmm1,%zmm11,%zmm9
|
|
vpternlogq $0x96,%zmm6,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11
|
|
vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3
|
|
vpternlogq $0x96,%zmm7,%zmm11,%zmm12
|
|
vpternlogq $0x96,%zmm8,%zmm3,%zmm13
|
|
vmovdqu64 320(%rsi),%ymm15
|
|
vpclmulqdq $0x01,%ymm15,%ymm5,%ymm7
|
|
vpclmulqdq $0x10,%ymm15,%ymm5,%ymm8
|
|
vpclmulqdq $0x11,%ymm15,%ymm5,%ymm1
|
|
vpclmulqdq $0x00,%ymm15,%ymm5,%ymm6
|
|
|
|
vpxorq %zmm12,%zmm7,%zmm7
|
|
vpxorq %zmm13,%zmm8,%zmm8
|
|
vpxorq %zmm9,%zmm1,%zmm1
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
|
|
vpslldq $4,%xmm2,%xmm2
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm2
|
|
|
|
jmp .L_CALC_AAD_done_1
|
|
.L_AAD_blocks_13_1:
|
|
subq $1536,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%zmm3
|
|
vmovdqu8 128(%r10),%zmm4
|
|
vmovdqu8 192(%r10),%xmm5{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpshufb %xmm16,%xmm5,%xmm5
|
|
vpxorq %zmm2,%zmm11,%zmm11
|
|
vmovdqu64 144(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
|
|
vmovdqu64 208(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
|
|
vmovdqu64 272(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11
|
|
vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3
|
|
vpternlogq $0x96,%zmm1,%zmm11,%zmm9
|
|
vpternlogq $0x96,%zmm6,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11
|
|
vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3
|
|
vpternlogq $0x96,%zmm7,%zmm11,%zmm12
|
|
vpternlogq $0x96,%zmm8,%zmm3,%zmm13
|
|
vmovdqu64 336(%rsi),%xmm15
|
|
vpclmulqdq $0x01,%xmm15,%xmm5,%xmm7
|
|
vpclmulqdq $0x10,%xmm15,%xmm5,%xmm8
|
|
vpclmulqdq $0x11,%xmm15,%xmm5,%xmm1
|
|
vpclmulqdq $0x00,%xmm15,%xmm5,%xmm6
|
|
|
|
vpxorq %zmm12,%zmm7,%zmm7
|
|
vpxorq %zmm13,%zmm8,%zmm8
|
|
vpxorq %zmm9,%zmm1,%zmm1
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
|
|
vpslldq $4,%xmm2,%xmm2
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm2
|
|
|
|
jmp .L_CALC_AAD_done_1
|
|
.L_AAD_blocks_12_1:
|
|
subq $1024,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%zmm3
|
|
vmovdqu8 128(%r10),%zmm4{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpxorq %zmm2,%zmm11,%zmm11
|
|
vmovdqu64 160(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
|
|
vmovdqu64 224(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
|
|
vmovdqu64 288(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11
|
|
vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3
|
|
vpternlogq $0x96,%zmm1,%zmm11,%zmm9
|
|
vpternlogq $0x96,%zmm6,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11
|
|
vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3
|
|
vpternlogq $0x96,%zmm7,%zmm11,%zmm12
|
|
vpternlogq $0x96,%zmm8,%zmm3,%zmm13
|
|
|
|
vpxorq %zmm13,%zmm12,%zmm12
|
|
vpsrldq $8,%zmm12,%zmm7
|
|
vpslldq $8,%zmm12,%zmm8
|
|
vpxorq %zmm7,%zmm9,%zmm1
|
|
vpxorq %zmm8,%zmm10,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
|
|
vpslldq $4,%xmm2,%xmm2
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm2
|
|
|
|
jmp .L_CALC_AAD_done_1
|
|
.L_AAD_blocks_11_1:
|
|
subq $1024,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%zmm3
|
|
vmovdqu8 128(%r10),%zmm4{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpxorq %zmm2,%zmm11,%zmm11
|
|
vmovdqu64 176(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
|
|
vmovdqu64 240(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
|
|
vpxorq %zmm9,%zmm1,%zmm9
|
|
vpxorq %zmm10,%zmm6,%zmm10
|
|
vpxorq %zmm12,%zmm7,%zmm12
|
|
vpxorq %zmm13,%zmm8,%zmm13
|
|
vmovdqu64 304(%rsi),%ymm15
|
|
vinserti64x2 $2,336(%rsi),%zmm15,%zmm15
|
|
vpclmulqdq $0x01,%zmm15,%zmm4,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm4,%zmm8
|
|
vpclmulqdq $0x11,%zmm15,%zmm4,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm4,%zmm6
|
|
|
|
vpxorq %zmm12,%zmm7,%zmm7
|
|
vpxorq %zmm13,%zmm8,%zmm8
|
|
vpxorq %zmm9,%zmm1,%zmm1
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
|
|
vpslldq $4,%xmm2,%xmm2
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm2
|
|
|
|
jmp .L_CALC_AAD_done_1
|
|
.L_AAD_blocks_10_1:
|
|
subq $1024,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%zmm3
|
|
vmovdqu8 128(%r10),%ymm4{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %ymm16,%ymm4,%ymm4
|
|
vpxorq %zmm2,%zmm11,%zmm11
|
|
vmovdqu64 192(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
|
|
vmovdqu64 256(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
|
|
vpxorq %zmm9,%zmm1,%zmm9
|
|
vpxorq %zmm10,%zmm6,%zmm10
|
|
vpxorq %zmm12,%zmm7,%zmm12
|
|
vpxorq %zmm13,%zmm8,%zmm13
|
|
vmovdqu64 320(%rsi),%ymm15
|
|
vpclmulqdq $0x01,%ymm15,%ymm4,%ymm7
|
|
vpclmulqdq $0x10,%ymm15,%ymm4,%ymm8
|
|
vpclmulqdq $0x11,%ymm15,%ymm4,%ymm1
|
|
vpclmulqdq $0x00,%ymm15,%ymm4,%ymm6
|
|
|
|
vpxorq %zmm12,%zmm7,%zmm7
|
|
vpxorq %zmm13,%zmm8,%zmm8
|
|
vpxorq %zmm9,%zmm1,%zmm1
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
|
|
vpslldq $4,%xmm2,%xmm2
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm2
|
|
|
|
jmp .L_CALC_AAD_done_1
|
|
.L_AAD_blocks_9_1:
|
|
subq $1024,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%zmm3
|
|
vmovdqu8 128(%r10),%xmm4{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %xmm16,%xmm4,%xmm4
|
|
vpxorq %zmm2,%zmm11,%zmm11
|
|
vmovdqu64 208(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
|
|
vmovdqu64 272(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
|
|
vpxorq %zmm9,%zmm1,%zmm9
|
|
vpxorq %zmm10,%zmm6,%zmm10
|
|
vpxorq %zmm12,%zmm7,%zmm12
|
|
vpxorq %zmm13,%zmm8,%zmm13
|
|
vmovdqu64 336(%rsi),%xmm15
|
|
vpclmulqdq $0x01,%xmm15,%xmm4,%xmm7
|
|
vpclmulqdq $0x10,%xmm15,%xmm4,%xmm8
|
|
vpclmulqdq $0x11,%xmm15,%xmm4,%xmm1
|
|
vpclmulqdq $0x00,%xmm15,%xmm4,%xmm6
|
|
|
|
vpxorq %zmm12,%zmm7,%zmm7
|
|
vpxorq %zmm13,%zmm8,%zmm8
|
|
vpxorq %zmm9,%zmm1,%zmm1
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
|
|
vpslldq $4,%xmm2,%xmm2
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm2
|
|
|
|
jmp .L_CALC_AAD_done_1
|
|
.L_AAD_blocks_8_1:
|
|
subq $512,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%zmm3{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpxorq %zmm2,%zmm11,%zmm11
|
|
vmovdqu64 224(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
|
|
vmovdqu64 288(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
|
|
vpxorq %zmm9,%zmm1,%zmm9
|
|
vpxorq %zmm10,%zmm6,%zmm10
|
|
vpxorq %zmm12,%zmm7,%zmm12
|
|
vpxorq %zmm13,%zmm8,%zmm13
|
|
|
|
vpxorq %zmm13,%zmm12,%zmm12
|
|
vpsrldq $8,%zmm12,%zmm7
|
|
vpslldq $8,%zmm12,%zmm8
|
|
vpxorq %zmm7,%zmm9,%zmm1
|
|
vpxorq %zmm8,%zmm10,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
|
|
vpslldq $4,%xmm2,%xmm2
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm2
|
|
|
|
jmp .L_CALC_AAD_done_1
|
|
.L_AAD_blocks_7_1:
|
|
subq $512,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%zmm3{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpxorq %zmm2,%zmm11,%zmm11
|
|
vmovdqu64 240(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13
|
|
vmovdqu64 304(%rsi),%ymm15
|
|
vinserti64x2 $2,336(%rsi),%zmm15,%zmm15
|
|
vpclmulqdq $0x01,%zmm15,%zmm3,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm3,%zmm8
|
|
vpclmulqdq $0x11,%zmm15,%zmm3,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm3,%zmm6
|
|
|
|
vpxorq %zmm12,%zmm7,%zmm7
|
|
vpxorq %zmm13,%zmm8,%zmm8
|
|
vpxorq %zmm9,%zmm1,%zmm1
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
|
|
vpslldq $4,%xmm2,%xmm2
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm2
|
|
|
|
jmp .L_CALC_AAD_done_1
|
|
.L_AAD_blocks_6_1:
|
|
subq $512,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%ymm3{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %ymm16,%ymm3,%ymm3
|
|
vpxorq %zmm2,%zmm11,%zmm11
|
|
vmovdqu64 256(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13
|
|
vmovdqu64 320(%rsi),%ymm15
|
|
vpclmulqdq $0x01,%ymm15,%ymm3,%ymm7
|
|
vpclmulqdq $0x10,%ymm15,%ymm3,%ymm8
|
|
vpclmulqdq $0x11,%ymm15,%ymm3,%ymm1
|
|
vpclmulqdq $0x00,%ymm15,%ymm3,%ymm6
|
|
|
|
vpxorq %zmm12,%zmm7,%zmm7
|
|
vpxorq %zmm13,%zmm8,%zmm8
|
|
vpxorq %zmm9,%zmm1,%zmm1
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
|
|
vpslldq $4,%xmm2,%xmm2
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm2
|
|
|
|
jmp .L_CALC_AAD_done_1
|
|
.L_AAD_blocks_5_1:
|
|
subq $512,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%xmm3{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %xmm16,%xmm3,%xmm3
|
|
vpxorq %zmm2,%zmm11,%zmm11
|
|
vmovdqu64 272(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13
|
|
vmovdqu64 336(%rsi),%xmm15
|
|
vpclmulqdq $0x01,%xmm15,%xmm3,%xmm7
|
|
vpclmulqdq $0x10,%xmm15,%xmm3,%xmm8
|
|
vpclmulqdq $0x11,%xmm15,%xmm3,%xmm1
|
|
vpclmulqdq $0x00,%xmm15,%xmm3,%xmm6
|
|
|
|
vpxorq %zmm12,%zmm7,%zmm7
|
|
vpxorq %zmm13,%zmm8,%zmm8
|
|
vpxorq %zmm9,%zmm1,%zmm1
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
|
|
vpslldq $4,%xmm2,%xmm2
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm2
|
|
|
|
jmp .L_CALC_AAD_done_1
|
|
.L_AAD_blocks_4_1:
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpxorq %zmm2,%zmm11,%zmm11
|
|
vmovdqu64 288(%rsi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13
|
|
|
|
vpxorq %zmm13,%zmm12,%zmm12
|
|
vpsrldq $8,%zmm12,%zmm7
|
|
vpslldq $8,%zmm12,%zmm8
|
|
vpxorq %zmm7,%zmm9,%zmm1
|
|
vpxorq %zmm8,%zmm10,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
|
|
vpslldq $4,%xmm2,%xmm2
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm2
|
|
|
|
jmp .L_CALC_AAD_done_1
|
|
.L_AAD_blocks_3_1:
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpxorq %zmm2,%zmm11,%zmm11
|
|
vmovdqu64 304(%rsi),%ymm15
|
|
vinserti64x2 $2,336(%rsi),%zmm15,%zmm15
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
|
|
vpslldq $4,%xmm2,%xmm2
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm2
|
|
|
|
jmp .L_CALC_AAD_done_1
|
|
.L_AAD_blocks_2_1:
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%ymm11{%k1}{z}
|
|
vpshufb %ymm16,%ymm11,%ymm11
|
|
vpxorq %zmm2,%zmm11,%zmm11
|
|
vmovdqu64 320(%rsi),%ymm15
|
|
vpclmulqdq $0x01,%ymm15,%ymm11,%ymm7
|
|
vpclmulqdq $0x10,%ymm15,%ymm11,%ymm8
|
|
vpclmulqdq $0x11,%ymm15,%ymm11,%ymm1
|
|
vpclmulqdq $0x00,%ymm15,%ymm11,%ymm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
|
|
vpslldq $4,%xmm2,%xmm2
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm2
|
|
|
|
jmp .L_CALC_AAD_done_1
|
|
.L_AAD_blocks_1_1:
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%xmm11{%k1}{z}
|
|
vpshufb %xmm16,%xmm11,%xmm11
|
|
vpxorq %zmm2,%zmm11,%zmm11
|
|
vmovdqu64 336(%rsi),%xmm15
|
|
vpclmulqdq $0x01,%xmm15,%xmm11,%xmm7
|
|
vpclmulqdq $0x10,%xmm15,%xmm11,%xmm8
|
|
vpclmulqdq $0x11,%xmm15,%xmm11,%xmm1
|
|
vpclmulqdq $0x00,%xmm15,%xmm11,%xmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
|
|
vpslldq $4,%xmm2,%xmm2
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm2
|
|
|
|
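# GHASH over the IV is complete: fold in a final block holding the IV length
# in bits (%rcx << 3), do one last multiply/reduce by H (336(%rsi)), and
# byte-swap the result to obtain the pre-counter block J0 in %xmm2.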
.L_CALC_AAD_done_1:
|
|
movq %rcx,%r10
|
|
shlq $3,%r10
|
|
vmovq %r10,%xmm3
|
|
|
|
|
|
vpxorq %xmm2,%xmm3,%xmm2
|
|
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
|
|
vpclmulqdq $0x11,%xmm1,%xmm2,%xmm11
|
|
vpclmulqdq $0x00,%xmm1,%xmm2,%xmm3
|
|
vpclmulqdq $0x01,%xmm1,%xmm2,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm2,%xmm2
|
|
vpxorq %xmm4,%xmm2,%xmm2
|
|
|
|
vpsrldq $8,%xmm2,%xmm4
|
|
vpslldq $8,%xmm2,%xmm2
|
|
vpxorq %xmm4,%xmm11,%xmm11
|
|
vpxorq %xmm3,%xmm2,%xmm2
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%xmm4
|
|
|
|
vpclmulqdq $0x01,%xmm2,%xmm4,%xmm3
|
|
vpslldq $8,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm2,%xmm2
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm2,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm2,%xmm4,%xmm2
|
|
vpslldq $4,%xmm2,%xmm2
|
|
|
|
vpternlogq $0x96,%xmm3,%xmm11,%xmm2
|
|
|
|
vpshufb SHUF_MASK(%rip),%xmm2,%xmm2
|
|
jmp skip_iv_len_12_init_IV
|
|
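# Fast path for the standard 96-bit IV: J0 is the 12 IV bytes followed by the
# 32-bit counter 1 (taken from the ONEf constant), so no GHASH is needed.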
iv_len_12_init_IV:
|
|
|
|
vmovdqu8 ONEf(%rip),%xmm2
|
|
movq %rdx,%r11
|
|
movl $0x0000000000000fff,%r10d
|
|
kmovq %r10,%k1
|
|
vmovdqu8 (%r11),%xmm2{%k1}
|
|
skip_iv_len_12_init_IV:
|
|
vmovdqu %xmm2,%xmm1
|
|
|
|
|
|
movl 240(%rdi),%r10d
|
|
cmpl $9,%r10d
|
|
je .Laes_128_4
|
|
cmpl $11,%r10d
|
|
je .Laes_192_4
|
|
cmpl $13,%r10d
|
|
je .Laes_256_4
|
|
jmp .Lexit_aes_4
|
|
.align 32
|
|
.Laes_128_4:
|
|
vpxorq 0(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 16(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 32(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 48(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 64(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 80(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 96(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 112(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 128(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 144(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenclast 160(%rdi),%xmm1,%xmm1
|
|
jmp .Lexit_aes_4
|
|
.align 32
|
|
.Laes_192_4:
|
|
vpxorq 0(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 16(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 32(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 48(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 64(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 80(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 96(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 112(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 128(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 144(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 160(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 176(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenclast 192(%rdi),%xmm1,%xmm1
|
|
jmp .Lexit_aes_4
|
|
.align 32
|
|
.Laes_256_4:
|
|
vpxorq 0(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 16(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 32(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 48(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 64(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 80(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 96(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 112(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 128(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 144(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 160(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 176(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 192(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenc 208(%rdi),%xmm1,%xmm1
|
|
|
|
vaesenclast 224(%rdi),%xmm1,%xmm1
|
|
jmp .Lexit_aes_4
|
|
.Lexit_aes_4:
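# Store the results into what looks like the GCM context at %rsi: the
# encrypted pre-counter block at 32(%rsi), the byte-reflected counter at
# 0(%rsi). The stacked hash-key powers are wiped below only when the
# long-IV path (%rcx > 256) spilled them to the stack.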
|
|
|
|
vmovdqu %xmm1,32(%rsi)
|
|
|
|
|
|
vpshufb SHUF_MASK(%rip),%xmm2,%xmm2
|
|
vmovdqu %xmm2,0(%rsi)
|
|
cmpq $256,%rcx
|
|
jbe .Lskip_hkeys_cleanup_5
|
|
vpxor %xmm0,%xmm0,%xmm0
|
|
vmovdqa64 %zmm0,0(%rsp)
|
|
vmovdqa64 %zmm0,64(%rsp)
|
|
vmovdqa64 %zmm0,128(%rsp)
|
|
vmovdqa64 %zmm0,192(%rsp)
|
|
vmovdqa64 %zmm0,256(%rsp)
|
|
vmovdqa64 %zmm0,320(%rsp)
|
|
vmovdqa64 %zmm0,384(%rsp)
|
|
vmovdqa64 %zmm0,448(%rsp)
|
|
vmovdqa64 %zmm0,512(%rsp)
|
|
vmovdqa64 %zmm0,576(%rsp)
|
|
vmovdqa64 %zmm0,640(%rsp)
|
|
vmovdqa64 %zmm0,704(%rsp)
|
|
.Lskip_hkeys_cleanup_5:
|
|
vzeroupper
|
|
leaq (%rbp),%rsp
|
|
.cfi_def_cfa_register %rsp
|
|
popq %r15
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_restore %r15
|
|
popq %r14
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_restore %r14
|
|
popq %r13
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_restore %r13
|
|
popq %r12
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_restore %r12
|
|
popq %rbp
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_restore %rbp
|
|
popq %rbx
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_restore %rbx
|
|
.Labort_setiv:
|
|
.byte 0xf3,0xc3
|
|
.Lsetiv_seh_end:
|
|
.cfi_endproc
|
|
.size ossl_aes_gcm_setiv_avx512, .-ossl_aes_gcm_setiv_avx512
|
|
.globl ossl_aes_gcm_update_aad_avx512
|
|
.type ossl_aes_gcm_update_aad_avx512,@function
|
|
.align 32
|
|
ossl_aes_gcm_update_aad_avx512:
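# GHASH-only entry point: folds additional AAD into the running hash.
# From the register usage, %rdi carries the context (hash state at
# 64(%rdi), key powers from 96(%rdi) onward), %rsi the AAD pointer and
# %rdx its length in bytes.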
|
|
.cfi_startproc
|
|
.Lghash_seh_begin:
|
|
.byte 243,15,30,250
|
|
pushq %rbx
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %rbx,-16
|
|
.Lghash_seh_push_rbx:
|
|
pushq %rbp
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %rbp,-24
|
|
.Lghash_seh_push_rbp:
|
|
pushq %r12
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %r12,-32
|
|
.Lghash_seh_push_r12:
|
|
pushq %r13
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %r13,-40
|
|
.Lghash_seh_push_r13:
|
|
pushq %r14
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %r14,-48
|
|
.Lghash_seh_push_r14:
|
|
pushq %r15
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %r15,-56
|
|
.Lghash_seh_push_r15:
leaq 0(%rsp),%rbp
|
|
.cfi_def_cfa_register %rbp
|
|
.Lghash_seh_setfp:
|
|
|
|
.Lghash_seh_prolog_end:
|
|
subq $820,%rsp
|
|
andq $(-64),%rsp
|
|
vmovdqu64 64(%rdi),%xmm14
|
|
movq %rsi,%r10
|
|
movq %rdx,%r11
|
|
orq %r11,%r11
|
|
jz .L_CALC_AAD_done_6
|
|
|
|
xorq %rbx,%rbx
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm16
|
|
|
|
.L_get_AAD_loop48x16_6:
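# Bulk AAD loop: while at least 768 bytes (48 blocks) remain, hash 48
# blocks per pass. On the first pass the higher hash-key powers are
# derived from the stored ones and cached at 0(%rsp)..704(%rsp); %rbx
# records that the cache is valid.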
|
|
cmpq $768,%r11
|
|
jl .L_exit_AAD_loop48x16_6
|
|
vmovdqu64 0(%r10),%zmm11
|
|
vmovdqu64 64(%r10),%zmm3
|
|
vmovdqu64 128(%r10),%zmm4
|
|
vmovdqu64 192(%r10),%zmm5
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpshufb %zmm16,%zmm5,%zmm5
|
|
testq %rbx,%rbx
|
|
jnz .L_skip_hkeys_precomputation_7
|
|
|
|
vmovdqu64 288(%rdi),%zmm1
|
|
vmovdqu64 %zmm1,704(%rsp)
|
|
|
|
vmovdqu64 224(%rdi),%zmm9
|
|
vmovdqu64 %zmm9,640(%rsp)
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm9,%zmm9,%zmm9
|
|
|
|
vmovdqu64 160(%rdi),%zmm10
|
|
vmovdqu64 %zmm10,576(%rsp)
|
|
|
|
vmovdqu64 96(%rdi),%zmm12
|
|
vmovdqu64 %zmm12,512(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm10,%zmm10
|
|
|
|
vpsrldq $8,%zmm10,%zmm17
|
|
vpslldq $8,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
|
|
vpslldq $4,%zmm10,%zmm10
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm10
|
|
|
|
vmovdqu64 %zmm10,448(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm12,%zmm12
|
|
|
|
vpsrldq $8,%zmm12,%zmm17
|
|
vpslldq $8,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
|
|
vpslldq $4,%zmm12,%zmm12
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm12
|
|
|
|
vmovdqu64 %zmm12,384(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm10,%zmm10
|
|
|
|
vpsrldq $8,%zmm10,%zmm17
|
|
vpslldq $8,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
|
|
vpslldq $4,%zmm10,%zmm10
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm10
|
|
|
|
vmovdqu64 %zmm10,320(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm12,%zmm12
|
|
|
|
vpsrldq $8,%zmm12,%zmm17
|
|
vpslldq $8,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
|
|
vpslldq $4,%zmm12,%zmm12
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm12
|
|
|
|
vmovdqu64 %zmm12,256(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm10,%zmm10
|
|
|
|
vpsrldq $8,%zmm10,%zmm17
|
|
vpslldq $8,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
|
|
vpslldq $4,%zmm10,%zmm10
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm10
|
|
|
|
vmovdqu64 %zmm10,192(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm12,%zmm12
|
|
|
|
vpsrldq $8,%zmm12,%zmm17
|
|
vpslldq $8,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
|
|
vpslldq $4,%zmm12,%zmm12
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm12
|
|
|
|
vmovdqu64 %zmm12,128(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm10,%zmm10
|
|
|
|
vpsrldq $8,%zmm10,%zmm17
|
|
vpslldq $8,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
|
|
vpslldq $4,%zmm10,%zmm10
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm10
|
|
|
|
vmovdqu64 %zmm10,64(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm12,%zmm12
|
|
|
|
vpsrldq $8,%zmm12,%zmm17
|
|
vpslldq $8,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
|
|
vpslldq $4,%zmm12,%zmm12
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm12
|
|
|
|
vmovdqu64 %zmm12,0(%rsp)
|
|
.L_skip_hkeys_precomputation_7:
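# 48-block GHASH accumulation: for each group of four blocks, vpclmulqdq
# produces the high, low and two middle partial products, vpternlogq 0x96
# (three-way XOR) merges them, and the 512-bit sums are then folded to
# 128 bits and reduced with the POLY2 constant.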
|
|
movq $1,%rbx
|
|
vpxorq %zmm14,%zmm11,%zmm11
|
|
vmovdqu64 0(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
|
|
vmovdqu64 64(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
|
|
vpxorq %zmm17,%zmm10,%zmm7
|
|
vpxorq %zmm13,%zmm1,%zmm6
|
|
vpxorq %zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
vmovdqu64 128(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
|
|
vmovdqu64 192(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm13,%zmm1,%zmm6
|
|
vpternlogq $0x96,%zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
vmovdqu64 256(%r10),%zmm11
|
|
vmovdqu64 320(%r10),%zmm3
|
|
vmovdqu64 384(%r10),%zmm4
|
|
vmovdqu64 448(%r10),%zmm5
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpshufb %zmm16,%zmm5,%zmm5
|
|
vmovdqu64 256(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
|
|
vmovdqu64 320(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
|
|
vpternlogq $0x96,%zmm17,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm13,%zmm1,%zmm6
|
|
vpternlogq $0x96,%zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
vmovdqu64 384(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
|
|
vmovdqu64 448(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm13,%zmm1,%zmm6
|
|
vpternlogq $0x96,%zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
vmovdqu64 512(%r10),%zmm11
|
|
vmovdqu64 576(%r10),%zmm3
|
|
vmovdqu64 640(%r10),%zmm4
|
|
vmovdqu64 704(%r10),%zmm5
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpshufb %zmm16,%zmm5,%zmm5
|
|
vmovdqu64 512(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
|
|
vmovdqu64 576(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
|
|
vpternlogq $0x96,%zmm17,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm13,%zmm1,%zmm6
|
|
vpternlogq $0x96,%zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
vmovdqu64 640(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
|
|
vmovdqu64 704(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm13,%zmm1,%zmm6
|
|
vpternlogq $0x96,%zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
|
|
vpsrldq $8,%zmm7,%zmm1
|
|
vpslldq $8,%zmm7,%zmm9
|
|
vpxorq %zmm1,%zmm6,%zmm6
|
|
vpxorq %zmm9,%zmm8,%zmm8
|
|
vextracti64x4 $1,%zmm6,%ymm1
|
|
vpxorq %ymm1,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm1
|
|
vpxorq %xmm1,%xmm6,%xmm6
|
|
vextracti64x4 $1,%zmm8,%ymm9
|
|
vpxorq %ymm9,%ymm8,%ymm8
|
|
vextracti32x4 $1,%ymm8,%xmm9
|
|
vpxorq %xmm9,%xmm8,%xmm8
|
|
vmovdqa64 POLY2(%rip),%xmm10
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1
|
|
vpslldq $8,%xmm1,%xmm1
|
|
vpxorq %xmm1,%xmm8,%xmm1
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9
|
|
vpsrldq $4,%xmm9,%xmm9
|
|
vpclmulqdq $0x10,%xmm1,%xmm10,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm6,%xmm9,%xmm14
|
|
|
|
subq $768,%r11
|
|
je .L_CALC_AAD_done_6
|
|
|
|
addq $768,%r10
|
|
jmp .L_get_AAD_loop48x16_6
|
|
|
|
.L_exit_AAD_loop48x16_6:
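# Tails of 32 and 16 blocks: the 512-byte pass extends the cached key
# powers on the stack if needed, the 256-byte pass multiplies directly
# against the powers stored in the context; both end in the same
# horizontal fold and POLY2 reduction.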
|
|
|
|
cmpq $512,%r11
|
|
jl .L_less_than_32x16_6
|
|
|
|
vmovdqu64 0(%r10),%zmm11
|
|
vmovdqu64 64(%r10),%zmm3
|
|
vmovdqu64 128(%r10),%zmm4
|
|
vmovdqu64 192(%r10),%zmm5
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpshufb %zmm16,%zmm5,%zmm5
|
|
testq %rbx,%rbx
|
|
jnz .L_skip_hkeys_precomputation_8
|
|
|
|
vmovdqu64 288(%rdi),%zmm1
|
|
vmovdqu64 %zmm1,704(%rsp)
|
|
|
|
vmovdqu64 224(%rdi),%zmm9
|
|
vmovdqu64 %zmm9,640(%rsp)
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm9,%zmm9,%zmm9
|
|
|
|
vmovdqu64 160(%rdi),%zmm10
|
|
vmovdqu64 %zmm10,576(%rsp)
|
|
|
|
vmovdqu64 96(%rdi),%zmm12
|
|
vmovdqu64 %zmm12,512(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm10,%zmm10
|
|
|
|
vpsrldq $8,%zmm10,%zmm17
|
|
vpslldq $8,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
|
|
vpslldq $4,%zmm10,%zmm10
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm10
|
|
|
|
vmovdqu64 %zmm10,448(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm12,%zmm12
|
|
|
|
vpsrldq $8,%zmm12,%zmm17
|
|
vpslldq $8,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
|
|
vpslldq $4,%zmm12,%zmm12
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm12
|
|
|
|
vmovdqu64 %zmm12,384(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm10,%zmm10
|
|
|
|
vpsrldq $8,%zmm10,%zmm17
|
|
vpslldq $8,%zmm10,%zmm10
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm10,%zmm10
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
|
|
vpslldq $4,%zmm10,%zmm10
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm10
|
|
|
|
vmovdqu64 %zmm10,320(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
|
|
vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
|
|
vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
|
|
vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm12,%zmm12
|
|
|
|
vpsrldq $8,%zmm12,%zmm17
|
|
vpslldq $8,%zmm12,%zmm12
|
|
vpxorq %zmm17,%zmm13,%zmm13
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm17
|
|
|
|
vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
|
|
vpslldq $8,%zmm15,%zmm15
|
|
vpxorq %zmm15,%zmm12,%zmm12
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
|
|
vpsrldq $4,%zmm15,%zmm15
|
|
vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
|
|
vpslldq $4,%zmm12,%zmm12
|
|
|
|
vpternlogq $0x96,%zmm15,%zmm13,%zmm12
|
|
|
|
vmovdqu64 %zmm12,256(%rsp)
|
|
.L_skip_hkeys_precomputation_8:
|
|
movq $1,%rbx
|
|
vpxorq %zmm14,%zmm11,%zmm11
|
|
vmovdqu64 256(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
|
|
vmovdqu64 320(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
|
|
vpxorq %zmm17,%zmm10,%zmm7
|
|
vpxorq %zmm13,%zmm1,%zmm6
|
|
vpxorq %zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
vmovdqu64 384(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
|
|
vmovdqu64 448(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm13,%zmm1,%zmm6
|
|
vpternlogq $0x96,%zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
vmovdqu64 256(%r10),%zmm11
|
|
vmovdqu64 320(%r10),%zmm3
|
|
vmovdqu64 384(%r10),%zmm4
|
|
vmovdqu64 448(%r10),%zmm5
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpshufb %zmm16,%zmm5,%zmm5
|
|
vmovdqu64 512(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
|
|
vmovdqu64 576(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
|
|
vpternlogq $0x96,%zmm17,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm13,%zmm1,%zmm6
|
|
vpternlogq $0x96,%zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
vmovdqu64 640(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
|
|
vmovdqu64 704(%rsp),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm13,%zmm1,%zmm6
|
|
vpternlogq $0x96,%zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
|
|
vpsrldq $8,%zmm7,%zmm1
|
|
vpslldq $8,%zmm7,%zmm9
|
|
vpxorq %zmm1,%zmm6,%zmm6
|
|
vpxorq %zmm9,%zmm8,%zmm8
|
|
vextracti64x4 $1,%zmm6,%ymm1
|
|
vpxorq %ymm1,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm1
|
|
vpxorq %xmm1,%xmm6,%xmm6
|
|
vextracti64x4 $1,%zmm8,%ymm9
|
|
vpxorq %ymm9,%ymm8,%ymm8
|
|
vextracti32x4 $1,%ymm8,%xmm9
|
|
vpxorq %xmm9,%xmm8,%xmm8
|
|
vmovdqa64 POLY2(%rip),%xmm10
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1
|
|
vpslldq $8,%xmm1,%xmm1
|
|
vpxorq %xmm1,%xmm8,%xmm1
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9
|
|
vpsrldq $4,%xmm9,%xmm9
|
|
vpclmulqdq $0x10,%xmm1,%xmm10,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm6,%xmm9,%xmm14
|
|
|
|
subq $512,%r11
|
|
je .L_CALC_AAD_done_6
|
|
|
|
addq $512,%r10
|
|
jmp .L_less_than_16x16_6
|
|
|
|
.L_less_than_32x16_6:
|
|
cmpq $256,%r11
|
|
jl .L_less_than_16x16_6
|
|
|
|
vmovdqu64 0(%r10),%zmm11
|
|
vmovdqu64 64(%r10),%zmm3
|
|
vmovdqu64 128(%r10),%zmm4
|
|
vmovdqu64 192(%r10),%zmm5
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpshufb %zmm16,%zmm5,%zmm5
|
|
vpxorq %zmm14,%zmm11,%zmm11
|
|
vmovdqu64 96(%rdi),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
|
|
vmovdqu64 160(%rdi),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
|
|
vpxorq %zmm17,%zmm10,%zmm7
|
|
vpxorq %zmm13,%zmm1,%zmm6
|
|
vpxorq %zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
vmovdqu64 224(%rdi),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
|
|
vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
|
|
vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
|
|
vmovdqu64 288(%rdi),%zmm19
|
|
vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
|
|
vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
|
|
vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
|
|
vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm13,%zmm1,%zmm6
|
|
vpternlogq $0x96,%zmm15,%zmm9,%zmm8
|
|
vpternlogq $0x96,%zmm18,%zmm12,%zmm7
|
|
|
|
vpsrldq $8,%zmm7,%zmm1
|
|
vpslldq $8,%zmm7,%zmm9
|
|
vpxorq %zmm1,%zmm6,%zmm6
|
|
vpxorq %zmm9,%zmm8,%zmm8
|
|
vextracti64x4 $1,%zmm6,%ymm1
|
|
vpxorq %ymm1,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm1
|
|
vpxorq %xmm1,%xmm6,%xmm6
|
|
vextracti64x4 $1,%zmm8,%ymm9
|
|
vpxorq %ymm9,%ymm8,%ymm8
|
|
vextracti32x4 $1,%ymm8,%xmm9
|
|
vpxorq %xmm9,%xmm8,%xmm8
|
|
vmovdqa64 POLY2(%rip),%xmm10
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1
|
|
vpslldq $8,%xmm1,%xmm1
|
|
vpxorq %xmm1,%xmm8,%xmm1
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9
|
|
vpsrldq $4,%xmm9,%xmm9
|
|
vpclmulqdq $0x10,%xmm1,%xmm10,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm6,%xmm9,%xmm14
|
|
|
|
subq $256,%r11
|
|
je .L_CALC_AAD_done_6
|
|
|
|
addq $256,%r10
|
|
|
|
.L_less_than_16x16_6:
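# Last 1..16 blocks: round the remaining byte count up to whole blocks
# and dispatch; byte64_len_to_mask_table yields a %k1 mask so the final,
# possibly partial, vector load reads only valid AAD bytes.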
|
|
|
|
leaq byte64_len_to_mask_table(%rip),%r12
|
|
leaq (%r12,%r11,8),%r12
|
|
|
|
|
|
addl $15,%r11d
|
|
shrl $4,%r11d
|
|
cmpl $2,%r11d
|
|
jb .L_AAD_blocks_1_6
|
|
je .L_AAD_blocks_2_6
|
|
cmpl $4,%r11d
|
|
jb .L_AAD_blocks_3_6
|
|
je .L_AAD_blocks_4_6
|
|
cmpl $6,%r11d
|
|
jb .L_AAD_blocks_5_6
|
|
je .L_AAD_blocks_6_6
|
|
cmpl $8,%r11d
|
|
jb .L_AAD_blocks_7_6
|
|
je .L_AAD_blocks_8_6
|
|
cmpl $10,%r11d
|
|
jb .L_AAD_blocks_9_6
|
|
je .L_AAD_blocks_10_6
|
|
cmpl $12,%r11d
|
|
jb .L_AAD_blocks_11_6
|
|
je .L_AAD_blocks_12_6
|
|
cmpl $14,%r11d
|
|
jb .L_AAD_blocks_13_6
|
|
je .L_AAD_blocks_14_6
|
|
cmpl $15,%r11d
|
|
je .L_AAD_blocks_15_6
|
|
.L_AAD_blocks_16_6:
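# Each .L_AAD_blocks_N handler pairs the N remaining blocks with key
# powers read from the context table, which appears to hold descending
# powers so that the last data block always meets H^1, then runs the
# shared fold-and-reduce.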
|
|
subq $1536,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%zmm3
|
|
vmovdqu8 128(%r10),%zmm4
|
|
vmovdqu8 192(%r10),%zmm5{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpshufb %zmm16,%zmm5,%zmm5
|
|
vpxorq %zmm14,%zmm11,%zmm11
|
|
vmovdqu64 96(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
|
|
vmovdqu64 160(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
|
|
vmovdqu64 224(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11
|
|
vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3
|
|
vpternlogq $0x96,%zmm9,%zmm11,%zmm1
|
|
vpternlogq $0x96,%zmm10,%zmm3,%zmm6
|
|
vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11
|
|
vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3
|
|
vpternlogq $0x96,%zmm12,%zmm11,%zmm7
|
|
vpternlogq $0x96,%zmm13,%zmm3,%zmm8
|
|
vmovdqu64 288(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm5,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm5,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm5,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm5,%zmm13
|
|
vpxorq %zmm9,%zmm1,%zmm9
|
|
vpxorq %zmm10,%zmm6,%zmm10
|
|
vpxorq %zmm12,%zmm7,%zmm12
|
|
vpxorq %zmm13,%zmm8,%zmm13
|
|
|
|
vpxorq %zmm13,%zmm12,%zmm12
|
|
vpsrldq $8,%zmm12,%zmm7
|
|
vpslldq $8,%zmm12,%zmm8
|
|
vpxorq %zmm7,%zmm9,%zmm1
|
|
vpxorq %zmm8,%zmm10,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm14
|
|
|
|
jmp .L_CALC_AAD_done_6
|
|
.L_AAD_blocks_15_6:
|
|
subq $1536,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%zmm3
|
|
vmovdqu8 128(%r10),%zmm4
|
|
vmovdqu8 192(%r10),%zmm5{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpshufb %zmm16,%zmm5,%zmm5
|
|
vpxorq %zmm14,%zmm11,%zmm11
|
|
vmovdqu64 112(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
|
|
vmovdqu64 176(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
|
|
vmovdqu64 240(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11
|
|
vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3
|
|
vpternlogq $0x96,%zmm1,%zmm11,%zmm9
|
|
vpternlogq $0x96,%zmm6,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11
|
|
vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3
|
|
vpternlogq $0x96,%zmm7,%zmm11,%zmm12
|
|
vpternlogq $0x96,%zmm8,%zmm3,%zmm13
|
|
vmovdqu64 304(%rdi),%ymm15
|
|
vinserti64x2 $2,336(%rdi),%zmm15,%zmm15
|
|
vpclmulqdq $0x01,%zmm15,%zmm5,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm5,%zmm8
|
|
vpclmulqdq $0x11,%zmm15,%zmm5,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm5,%zmm6
|
|
|
|
vpxorq %zmm12,%zmm7,%zmm7
|
|
vpxorq %zmm13,%zmm8,%zmm8
|
|
vpxorq %zmm9,%zmm1,%zmm1
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm14
|
|
|
|
jmp .L_CALC_AAD_done_6
|
|
.L_AAD_blocks_14_6:
|
|
subq $1536,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%zmm3
|
|
vmovdqu8 128(%r10),%zmm4
|
|
vmovdqu8 192(%r10),%ymm5{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpshufb %ymm16,%ymm5,%ymm5
|
|
vpxorq %zmm14,%zmm11,%zmm11
|
|
vmovdqu64 128(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
|
|
vmovdqu64 192(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
|
|
vmovdqu64 256(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11
|
|
vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3
|
|
vpternlogq $0x96,%zmm1,%zmm11,%zmm9
|
|
vpternlogq $0x96,%zmm6,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11
|
|
vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3
|
|
vpternlogq $0x96,%zmm7,%zmm11,%zmm12
|
|
vpternlogq $0x96,%zmm8,%zmm3,%zmm13
|
|
vmovdqu64 320(%rdi),%ymm15
|
|
vpclmulqdq $0x01,%ymm15,%ymm5,%ymm7
|
|
vpclmulqdq $0x10,%ymm15,%ymm5,%ymm8
|
|
vpclmulqdq $0x11,%ymm15,%ymm5,%ymm1
|
|
vpclmulqdq $0x00,%ymm15,%ymm5,%ymm6
|
|
|
|
vpxorq %zmm12,%zmm7,%zmm7
|
|
vpxorq %zmm13,%zmm8,%zmm8
|
|
vpxorq %zmm9,%zmm1,%zmm1
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm14
|
|
|
|
jmp .L_CALC_AAD_done_6
|
|
.L_AAD_blocks_13_6:
|
|
subq $1536,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%zmm3
|
|
vmovdqu8 128(%r10),%zmm4
|
|
vmovdqu8 192(%r10),%xmm5{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpshufb %xmm16,%xmm5,%xmm5
|
|
vpxorq %zmm14,%zmm11,%zmm11
|
|
vmovdqu64 144(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
|
|
vmovdqu64 208(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
|
|
vmovdqu64 272(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11
|
|
vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3
|
|
vpternlogq $0x96,%zmm1,%zmm11,%zmm9
|
|
vpternlogq $0x96,%zmm6,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11
|
|
vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3
|
|
vpternlogq $0x96,%zmm7,%zmm11,%zmm12
|
|
vpternlogq $0x96,%zmm8,%zmm3,%zmm13
|
|
vmovdqu64 336(%rdi),%xmm15
|
|
vpclmulqdq $0x01,%xmm15,%xmm5,%xmm7
|
|
vpclmulqdq $0x10,%xmm15,%xmm5,%xmm8
|
|
vpclmulqdq $0x11,%xmm15,%xmm5,%xmm1
|
|
vpclmulqdq $0x00,%xmm15,%xmm5,%xmm6
|
|
|
|
vpxorq %zmm12,%zmm7,%zmm7
|
|
vpxorq %zmm13,%zmm8,%zmm8
|
|
vpxorq %zmm9,%zmm1,%zmm1
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm14
|
|
|
|
jmp .L_CALC_AAD_done_6
|
|
.L_AAD_blocks_12_6:
|
|
subq $1024,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%zmm3
|
|
vmovdqu8 128(%r10),%zmm4{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpxorq %zmm14,%zmm11,%zmm11
|
|
vmovdqu64 160(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
|
|
vmovdqu64 224(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
|
|
vmovdqu64 288(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11
|
|
vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3
|
|
vpternlogq $0x96,%zmm1,%zmm11,%zmm9
|
|
vpternlogq $0x96,%zmm6,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11
|
|
vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3
|
|
vpternlogq $0x96,%zmm7,%zmm11,%zmm12
|
|
vpternlogq $0x96,%zmm8,%zmm3,%zmm13
|
|
|
|
vpxorq %zmm13,%zmm12,%zmm12
|
|
vpsrldq $8,%zmm12,%zmm7
|
|
vpslldq $8,%zmm12,%zmm8
|
|
vpxorq %zmm7,%zmm9,%zmm1
|
|
vpxorq %zmm8,%zmm10,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm14
|
|
|
|
jmp .L_CALC_AAD_done_6
|
|
.L_AAD_blocks_11_6:
|
|
subq $1024,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%zmm3
|
|
vmovdqu8 128(%r10),%zmm4{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %zmm16,%zmm4,%zmm4
|
|
vpxorq %zmm14,%zmm11,%zmm11
|
|
vmovdqu64 176(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
|
|
vmovdqu64 240(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
|
|
vpxorq %zmm9,%zmm1,%zmm9
|
|
vpxorq %zmm10,%zmm6,%zmm10
|
|
vpxorq %zmm12,%zmm7,%zmm12
|
|
vpxorq %zmm13,%zmm8,%zmm13
|
|
vmovdqu64 304(%rdi),%ymm15
|
|
vinserti64x2 $2,336(%rdi),%zmm15,%zmm15
|
|
vpclmulqdq $0x01,%zmm15,%zmm4,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm4,%zmm8
|
|
vpclmulqdq $0x11,%zmm15,%zmm4,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm4,%zmm6
|
|
|
|
vpxorq %zmm12,%zmm7,%zmm7
|
|
vpxorq %zmm13,%zmm8,%zmm8
|
|
vpxorq %zmm9,%zmm1,%zmm1
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm14
|
|
|
|
jmp .L_CALC_AAD_done_6
|
|
.L_AAD_blocks_10_6:
|
|
subq $1024,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%zmm3
|
|
vmovdqu8 128(%r10),%ymm4{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %ymm16,%ymm4,%ymm4
|
|
vpxorq %zmm14,%zmm11,%zmm11
|
|
vmovdqu64 192(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
|
|
vmovdqu64 256(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
|
|
vpxorq %zmm9,%zmm1,%zmm9
|
|
vpxorq %zmm10,%zmm6,%zmm10
|
|
vpxorq %zmm12,%zmm7,%zmm12
|
|
vpxorq %zmm13,%zmm8,%zmm13
|
|
vmovdqu64 320(%rdi),%ymm15
|
|
vpclmulqdq $0x01,%ymm15,%ymm4,%ymm7
|
|
vpclmulqdq $0x10,%ymm15,%ymm4,%ymm8
|
|
vpclmulqdq $0x11,%ymm15,%ymm4,%ymm1
|
|
vpclmulqdq $0x00,%ymm15,%ymm4,%ymm6
|
|
|
|
vpxorq %zmm12,%zmm7,%zmm7
|
|
vpxorq %zmm13,%zmm8,%zmm8
|
|
vpxorq %zmm9,%zmm1,%zmm1
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm14
|
|
|
|
jmp .L_CALC_AAD_done_6
|
|
.L_AAD_blocks_9_6:
|
|
subq $1024,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%zmm3
|
|
vmovdqu8 128(%r10),%xmm4{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpshufb %xmm16,%xmm4,%xmm4
|
|
vpxorq %zmm14,%zmm11,%zmm11
|
|
vmovdqu64 208(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
|
|
vmovdqu64 272(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
|
|
vpxorq %zmm9,%zmm1,%zmm9
|
|
vpxorq %zmm10,%zmm6,%zmm10
|
|
vpxorq %zmm12,%zmm7,%zmm12
|
|
vpxorq %zmm13,%zmm8,%zmm13
|
|
vmovdqu64 336(%rdi),%xmm15
|
|
vpclmulqdq $0x01,%xmm15,%xmm4,%xmm7
|
|
vpclmulqdq $0x10,%xmm15,%xmm4,%xmm8
|
|
vpclmulqdq $0x11,%xmm15,%xmm4,%xmm1
|
|
vpclmulqdq $0x00,%xmm15,%xmm4,%xmm6
|
|
|
|
vpxorq %zmm12,%zmm7,%zmm7
|
|
vpxorq %zmm13,%zmm8,%zmm8
|
|
vpxorq %zmm9,%zmm1,%zmm1
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm14
|
|
|
|
jmp .L_CALC_AAD_done_6
|
|
.L_AAD_blocks_8_6:
|
|
subq $512,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%zmm3{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpxorq %zmm14,%zmm11,%zmm11
|
|
vmovdqu64 224(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
|
|
vmovdqu64 288(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
|
|
vpxorq %zmm9,%zmm1,%zmm9
|
|
vpxorq %zmm10,%zmm6,%zmm10
|
|
vpxorq %zmm12,%zmm7,%zmm12
|
|
vpxorq %zmm13,%zmm8,%zmm13
|
|
|
|
vpxorq %zmm13,%zmm12,%zmm12
|
|
vpsrldq $8,%zmm12,%zmm7
|
|
vpslldq $8,%zmm12,%zmm8
|
|
vpxorq %zmm7,%zmm9,%zmm1
|
|
vpxorq %zmm8,%zmm10,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm14
|
|
|
|
jmp .L_CALC_AAD_done_6
|
|
.L_AAD_blocks_7_6:
|
|
subq $512,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%zmm3{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %zmm16,%zmm3,%zmm3
|
|
vpxorq %zmm14,%zmm11,%zmm11
|
|
vmovdqu64 240(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13
|
|
vmovdqu64 304(%rdi),%ymm15
|
|
vinserti64x2 $2,336(%rdi),%zmm15,%zmm15
|
|
vpclmulqdq $0x01,%zmm15,%zmm3,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm3,%zmm8
|
|
vpclmulqdq $0x11,%zmm15,%zmm3,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm3,%zmm6
|
|
|
|
vpxorq %zmm12,%zmm7,%zmm7
|
|
vpxorq %zmm13,%zmm8,%zmm8
|
|
vpxorq %zmm9,%zmm1,%zmm1
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm14
|
|
|
|
jmp .L_CALC_AAD_done_6
|
|
.L_AAD_blocks_6_6:
|
|
subq $512,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%ymm3{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %ymm16,%ymm3,%ymm3
|
|
vpxorq %zmm14,%zmm11,%zmm11
|
|
vmovdqu64 256(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13
|
|
vmovdqu64 320(%rdi),%ymm15
|
|
vpclmulqdq $0x01,%ymm15,%ymm3,%ymm7
|
|
vpclmulqdq $0x10,%ymm15,%ymm3,%ymm8
|
|
vpclmulqdq $0x11,%ymm15,%ymm3,%ymm1
|
|
vpclmulqdq $0x00,%ymm15,%ymm3,%ymm6
|
|
|
|
vpxorq %zmm12,%zmm7,%zmm7
|
|
vpxorq %zmm13,%zmm8,%zmm8
|
|
vpxorq %zmm9,%zmm1,%zmm1
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm14
|
|
|
|
jmp .L_CALC_AAD_done_6
|
|
.L_AAD_blocks_5_6:
|
|
subq $512,%r12
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11
|
|
vmovdqu8 64(%r10),%xmm3{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpshufb %xmm16,%xmm3,%xmm3
|
|
vpxorq %zmm14,%zmm11,%zmm11
|
|
vmovdqu64 272(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13
|
|
vmovdqu64 336(%rdi),%xmm15
|
|
vpclmulqdq $0x01,%xmm15,%xmm3,%xmm7
|
|
vpclmulqdq $0x10,%xmm15,%xmm3,%xmm8
|
|
vpclmulqdq $0x11,%xmm15,%xmm3,%xmm1
|
|
vpclmulqdq $0x00,%xmm15,%xmm3,%xmm6
|
|
|
|
vpxorq %zmm12,%zmm7,%zmm7
|
|
vpxorq %zmm13,%zmm8,%zmm8
|
|
vpxorq %zmm9,%zmm1,%zmm1
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm14
|
|
|
|
jmp .L_CALC_AAD_done_6
|
|
.L_AAD_blocks_4_6:
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpxorq %zmm14,%zmm11,%zmm11
|
|
vmovdqu64 288(%rdi),%zmm15
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13
|
|
|
|
vpxorq %zmm13,%zmm12,%zmm12
|
|
vpsrldq $8,%zmm12,%zmm7
|
|
vpslldq $8,%zmm12,%zmm8
|
|
vpxorq %zmm7,%zmm9,%zmm1
|
|
vpxorq %zmm8,%zmm10,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm14
|
|
|
|
jmp .L_CALC_AAD_done_6
|
|
.L_AAD_blocks_3_6:
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%zmm11{%k1}{z}
|
|
vpshufb %zmm16,%zmm11,%zmm11
|
|
vpxorq %zmm14,%zmm11,%zmm11
|
|
vmovdqu64 304(%rdi),%ymm15
|
|
vinserti64x2 $2,336(%rdi),%zmm15,%zmm15
|
|
vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
|
|
vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
|
|
vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
|
|
vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm14
|
|
|
|
jmp .L_CALC_AAD_done_6
|
|
.L_AAD_blocks_2_6:
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%ymm11{%k1}{z}
|
|
vpshufb %ymm16,%ymm11,%ymm11
|
|
vpxorq %zmm14,%zmm11,%zmm11
|
|
vmovdqu64 320(%rdi),%ymm15
|
|
vpclmulqdq $0x01,%ymm15,%ymm11,%ymm7
|
|
vpclmulqdq $0x10,%ymm15,%ymm11,%ymm8
|
|
vpclmulqdq $0x11,%ymm15,%ymm11,%ymm1
|
|
vpclmulqdq $0x00,%ymm15,%ymm11,%ymm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm14
|
|
|
|
jmp .L_CALC_AAD_done_6
|
|
.L_AAD_blocks_1_6:
|
|
kmovq (%r12),%k1
|
|
vmovdqu8 0(%r10),%xmm11{%k1}{z}
|
|
vpshufb %xmm16,%xmm11,%xmm11
|
|
vpxorq %zmm14,%zmm11,%zmm11
|
|
vmovdqu64 336(%rdi),%xmm15
|
|
vpclmulqdq $0x01,%xmm15,%xmm11,%xmm7
|
|
vpclmulqdq $0x10,%xmm15,%xmm11,%xmm8
|
|
vpclmulqdq $0x11,%xmm15,%xmm11,%xmm1
|
|
vpclmulqdq $0x00,%xmm15,%xmm11,%xmm6
|
|
|
|
vpxorq %zmm8,%zmm7,%zmm7
|
|
vpsrldq $8,%zmm7,%zmm12
|
|
vpslldq $8,%zmm7,%zmm13
|
|
vpxorq %zmm12,%zmm1,%zmm1
|
|
vpxorq %zmm13,%zmm6,%zmm6
|
|
vextracti64x4 $1,%zmm1,%ymm12
|
|
vpxorq %ymm12,%ymm1,%ymm1
|
|
vextracti32x4 $1,%ymm1,%xmm12
|
|
vpxorq %xmm12,%xmm1,%xmm1
|
|
vextracti64x4 $1,%zmm6,%ymm13
|
|
vpxorq %ymm13,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm13
|
|
vpxorq %xmm13,%xmm6,%xmm6
|
|
vmovdqa64 POLY2(%rip),%xmm15
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
|
|
vpslldq $8,%xmm7,%xmm7
|
|
vpxorq %xmm7,%xmm6,%xmm7
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
|
|
vpsrldq $4,%xmm8,%xmm8
|
|
vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm1,%xmm8,%xmm14
|
|
|
|
.L_CALC_AAD_done_6:
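# Write the updated hash state back to 64(%rdi); if more than 256 bytes
# of AAD were processed, the key powers spilled to the stack are zeroed
# before the frame is released.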
|
|
vmovdqu64 %xmm14,64(%rdi)
|
|
cmpq $256,%rdx
|
|
jbe .Lskip_hkeys_cleanup_9
|
|
vpxor %xmm0,%xmm0,%xmm0
|
|
vmovdqa64 %zmm0,0(%rsp)
|
|
vmovdqa64 %zmm0,64(%rsp)
|
|
vmovdqa64 %zmm0,128(%rsp)
|
|
vmovdqa64 %zmm0,192(%rsp)
|
|
vmovdqa64 %zmm0,256(%rsp)
|
|
vmovdqa64 %zmm0,320(%rsp)
|
|
vmovdqa64 %zmm0,384(%rsp)
|
|
vmovdqa64 %zmm0,448(%rsp)
|
|
vmovdqa64 %zmm0,512(%rsp)
|
|
vmovdqa64 %zmm0,576(%rsp)
|
|
vmovdqa64 %zmm0,640(%rsp)
|
|
vmovdqa64 %zmm0,704(%rsp)
|
|
.Lskip_hkeys_cleanup_9:
|
|
vzeroupper
|
|
leaq (%rbp),%rsp
|
|
.cfi_def_cfa_register %rsp
|
|
popq %r15
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_restore %r15
|
|
popq %r14
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_restore %r14
|
|
popq %r13
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_restore %r13
|
|
popq %r12
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_restore %r12
|
|
popq %rbp
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_restore %rbp
|
|
popq %rbx
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_restore %rbx
|
|
.Lexit_update_aad:
|
|
.byte 0xf3,0xc3
|
|
.Lghash_seh_end:
|
|
.cfi_endproc
|
|
.size ossl_aes_gcm_update_aad_avx512, .-ossl_aes_gcm_update_aad_avx512
|
|
.globl ossl_aes_gcm_encrypt_avx512
|
|
.type ossl_aes_gcm_encrypt_avx512,@function
|
|
.align 32
|
|
ossl_aes_gcm_encrypt_avx512:
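# Encryption entry point: after the SEH/CFI prologue and 64-byte stack
# alignment, the round-count word at 240(%rdi) (9/11/13 here) selects the
# AES-128/192/256 body; anything else clears %eax and exits.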
|
|
.cfi_startproc
|
|
.Lencrypt_seh_begin:
|
|
.byte 243,15,30,250
|
|
pushq %rbx
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %rbx,-16
|
|
.Lencrypt_seh_push_rbx:
|
|
pushq %rbp
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %rbp,-24
|
|
.Lencrypt_seh_push_rbp:
|
|
pushq %r12
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %r12,-32
|
|
.Lencrypt_seh_push_r12:
|
|
pushq %r13
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %r13,-40
|
|
.Lencrypt_seh_push_r13:
|
|
pushq %r14
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %r14,-48
|
|
.Lencrypt_seh_push_r14:
|
|
pushq %r15
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %r15,-56
|
|
.Lencrypt_seh_push_r15:
leaq 0(%rsp),%rbp
|
|
.cfi_def_cfa_register %rbp
|
|
.Lencrypt_seh_setfp:
|
|
|
|
.Lencrypt_seh_prolog_end:
|
|
subq $1588,%rsp
|
|
andq $(-64),%rsp
|
|
|
|
|
|
movl 240(%rdi),%eax
|
|
cmpl $9,%eax
|
|
je .Laes_gcm_encrypt_128_avx512
|
|
cmpl $11,%eax
|
|
je .Laes_gcm_encrypt_192_avx512
|
|
cmpl $13,%eax
|
|
je .Laes_gcm_encrypt_256_avx512
|
|
xorl %eax,%eax
|
|
jmp .Lexit_gcm_encrypt
|
|
.align 32
|
|
.Laes_gcm_encrypt_128_avx512:
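# AES-128 body. From the register usage: %rsi looks like the GCM state,
# (%rdx) a partial-block byte count kept in memory, %rcx the source,
# %r9 the destination and %r8 the remaining byte count. Any carried-over
# partial block is completed first, then the code picks the <=256-byte
# tail path or the pipelined bulk path.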
|
|
orq %r8,%r8
|
|
je .L_enc_dec_done_10
|
|
xorq %r14,%r14
|
|
vmovdqu64 64(%rsi),%xmm14
|
|
|
|
movq (%rdx),%r11
|
|
orq %r11,%r11
|
|
je .L_partial_block_done_11
|
|
movl $16,%r10d
|
|
leaq byte_len_to_mask_table(%rip),%r12
|
|
cmpq %r10,%r8
|
|
cmovcq %r8,%r10
|
|
kmovw (%r12,%r10,2),%k1
|
|
vmovdqu8 (%rcx),%xmm0{%k1}{z}
|
|
|
|
vmovdqu64 16(%rsi),%xmm3
|
|
vmovdqu64 336(%rsi),%xmm4
|
|
|
|
|
|
|
|
leaq SHIFT_MASK(%rip),%r12
|
|
addq %r11,%r12
|
|
vmovdqu64 (%r12),%xmm5
|
|
vpshufb %xmm5,%xmm3,%xmm3
|
|
vpxorq %xmm0,%xmm3,%xmm3
|
|
|
|
|
|
leaq (%r8,%r11,1),%r13
|
|
subq $16,%r13
|
|
jge .L_no_extra_mask_11
|
|
subq %r13,%r12
|
|
.L_no_extra_mask_11:
|
|
|
|
|
|
|
|
vmovdqu64 16(%r12),%xmm0
|
|
vpand %xmm0,%xmm3,%xmm3
|
|
vpshufb SHUF_MASK(%rip),%xmm3,%xmm3
|
|
vpshufb %xmm5,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm14,%xmm14
|
|
cmpq $0,%r13
|
|
jl .L_partial_incomplete_11
|
|
|
|
vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7
|
|
vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10
|
|
vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11
|
|
vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14
|
|
vpxorq %xmm11,%xmm14,%xmm14
|
|
|
|
vpsrldq $8,%xmm14,%xmm11
|
|
vpslldq $8,%xmm14,%xmm14
|
|
vpxorq %xmm11,%xmm7,%xmm7
|
|
vpxorq %xmm10,%xmm14,%xmm14
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%xmm11
|
|
|
|
vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10
|
|
vpslldq $8,%xmm10,%xmm10
|
|
vpxorq %xmm10,%xmm14,%xmm14
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10
|
|
vpsrldq $4,%xmm10,%xmm10
|
|
vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
|
|
vpternlogq $0x96,%xmm10,%xmm7,%xmm14
|
|
|
|
movq $0,(%rdx)
|
|
|
|
movq %r11,%r12
|
|
movq $16,%r11
|
|
subq %r12,%r11
|
|
jmp .L_enc_dec_done_11
|
|
|
|
.L_partial_incomplete_11:
|
|
addq %r8,(%rdx)
|
|
movq %r8,%r11
|
|
|
|
.L_enc_dec_done_11:
|
|
|
|
|
|
leaq byte_len_to_mask_table(%rip),%r12
|
|
kmovw (%r12,%r11,2),%k1
|
|
vmovdqu64 %xmm14,64(%rsi)
|
|
|
|
vpshufb SHUF_MASK(%rip),%xmm3,%xmm3
|
|
vpshufb %xmm5,%xmm3,%xmm3
|
|
movq %r9,%r12
|
|
vmovdqu8 %xmm3,(%r12){%k1}
|
|
.L_partial_block_done_11:
|
|
vmovdqu64 0(%rsi),%xmm2
|
|
subq %r11,%r8
|
|
je .L_enc_dec_done_10
|
|
cmpq $256,%r8
|
|
jbe .L_message_below_equal_16_blocks_10
|
|
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vmovdqa64 ddq_addbe_4444(%rip),%zmm27
|
|
vmovdqa64 ddq_addbe_1234(%rip),%zmm28
|
vmovd %xmm2,%r15d
|
|
andl $255,%r15d
|
|
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
|
|
|
|
|
|
cmpb $240,%r15b
|
|
jae .L_next_16_overflow_12
|
|
vpaddd %zmm28,%zmm2,%zmm7
|
|
vpaddd %zmm27,%zmm7,%zmm10
|
|
vpaddd %zmm27,%zmm10,%zmm11
|
|
vpaddd %zmm27,%zmm11,%zmm12
|
|
jmp .L_next_16_ok_12
|
|
.L_next_16_overflow_12:
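# Counter overflow path: counters are kept byte-reflected, so the usual
# addbe vpaddd trick cannot carry out of the low counter byte. When
# %r15b shows the next 16 increments would wrap it, shuffle to plain
# little-endian, add 1..4 and then 4 per lane normally, and shuffle back.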
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm12
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
|
|
vpaddd %zmm12,%zmm7,%zmm10
|
|
vpaddd %zmm12,%zmm10,%zmm11
|
|
vpaddd %zmm12,%zmm11,%zmm12
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vpshufb %zmm29,%zmm12,%zmm12
|
|
.L_next_16_ok_12:
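# Encrypt 16 counter blocks (four zmm registers) with one broadcast round
# key per round, XOR against 256 bytes of input, store the ciphertext,
# and keep byte-reflected copies at 768(%rsp)..960(%rsp) so their GHASH
# contribution can be folded in by a later pass.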
|
|
vshufi64x2 $255,%zmm12,%zmm12,%zmm2
|
|
addb $16,%r15b
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm0
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm3
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm4
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm5
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm6
|
|
vpxorq %zmm6,%zmm7,%zmm7
|
|
vpxorq %zmm6,%zmm10,%zmm10
|
|
vpxorq %zmm6,%zmm11,%zmm11
|
|
vpxorq %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 16(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 32(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 48(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 64(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 80(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 96(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 112(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 128(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 144(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 160(%rdi),%zmm6
|
|
vaesenclast %zmm6,%zmm7,%zmm7
|
|
vaesenclast %zmm6,%zmm10,%zmm10
|
|
vaesenclast %zmm6,%zmm11,%zmm11
|
|
vaesenclast %zmm6,%zmm12,%zmm12
|
|
|
|
|
|
vpxorq %zmm0,%zmm7,%zmm7
|
|
vpxorq %zmm3,%zmm10,%zmm10
|
|
vpxorq %zmm4,%zmm11,%zmm11
|
|
vpxorq %zmm5,%zmm12,%zmm12
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm7,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm10,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm11,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm12,192(%r10,%r11,1)
|
|
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vpshufb %zmm29,%zmm12,%zmm12
|
|
vmovdqa64 %zmm7,768(%rsp)
|
|
vmovdqa64 %zmm10,832(%rsp)
|
|
vmovdqa64 %zmm11,896(%rsp)
|
|
vmovdqa64 %zmm12,960(%rsp)
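# The byte-reflected ciphertext is parked in the stack frame starting at
# 768(%rsp); its GHASH contribution is folded in later, interleaved with the
# encryption of subsequent blocks.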
|
|
testq %r14,%r14
|
|
jnz .L_skip_hkeys_precomputation_13
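# First pass only (%r14 == 0): copy the precomputed hash-key powers from the
# context (offsets 96, 160, 224 and 288 of %rsi) into the stack frame so the
# streaming GHASH below can address them relative to %rsp.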
|
|
|
|
vmovdqu64 288(%rsi),%zmm0
|
|
vmovdqu64 %zmm0,704(%rsp)
|
|
|
|
vmovdqu64 224(%rsi),%zmm3
|
|
vmovdqu64 %zmm3,640(%rsp)
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
|
|
|
|
vmovdqu64 160(%rsi),%zmm4
|
|
vmovdqu64 %zmm4,576(%rsp)
|
|
|
|
vmovdqu64 96(%rsi),%zmm5
|
|
vmovdqu64 %zmm5,512(%rsp)
|
|
.L_skip_hkeys_precomputation_13:
|
|
cmpq $512,%r8
|
|
jb .L_message_below_32_blocks_10
cmpb $240,%r15b
|
|
jae .L_next_16_overflow_14
|
|
vpaddd %zmm28,%zmm2,%zmm7
|
|
vpaddd %zmm27,%zmm7,%zmm10
|
|
vpaddd %zmm27,%zmm10,%zmm11
|
|
vpaddd %zmm27,%zmm11,%zmm12
|
|
jmp .L_next_16_ok_14
|
|
.L_next_16_overflow_14:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm12
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
|
|
vpaddd %zmm12,%zmm7,%zmm10
|
|
vpaddd %zmm12,%zmm10,%zmm11
|
|
vpaddd %zmm12,%zmm11,%zmm12
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vpshufb %zmm29,%zmm12,%zmm12
|
|
.L_next_16_ok_14:
|
|
vshufi64x2 $255,%zmm12,%zmm12,%zmm2
|
|
addb $16,%r15b
|
|
|
|
vmovdqu8 256(%rcx,%r11,1),%zmm0
|
|
vmovdqu8 320(%rcx,%r11,1),%zmm3
|
|
vmovdqu8 384(%rcx,%r11,1),%zmm4
|
|
vmovdqu8 448(%rcx,%r11,1),%zmm5
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm6
|
|
vpxorq %zmm6,%zmm7,%zmm7
|
|
vpxorq %zmm6,%zmm10,%zmm10
|
|
vpxorq %zmm6,%zmm11,%zmm11
|
|
vpxorq %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 16(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 32(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 48(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 64(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 80(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 96(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 112(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 128(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 144(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 160(%rdi),%zmm6
|
|
vaesenclast %zmm6,%zmm7,%zmm7
|
|
vaesenclast %zmm6,%zmm10,%zmm10
|
|
vaesenclast %zmm6,%zmm11,%zmm11
|
|
vaesenclast %zmm6,%zmm12,%zmm12
|
|
|
|
|
|
vpxorq %zmm0,%zmm7,%zmm7
|
|
vpxorq %zmm3,%zmm10,%zmm10
|
|
vpxorq %zmm4,%zmm11,%zmm11
|
|
vpxorq %zmm5,%zmm12,%zmm12
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm7,256(%r10,%r11,1)
|
|
vmovdqu8 %zmm10,320(%r10,%r11,1)
|
|
vmovdqu8 %zmm11,384(%r10,%r11,1)
|
|
vmovdqu8 %zmm12,448(%r10,%r11,1)
|
|
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vpshufb %zmm29,%zmm12,%zmm12
|
|
vmovdqa64 %zmm7,1024(%rsp)
|
|
vmovdqa64 %zmm10,1088(%rsp)
|
|
vmovdqa64 %zmm11,1152(%rsp)
|
|
vmovdqa64 %zmm12,1216(%rsp)
|
|
testq %r14,%r14
|
|
jnz .L_skip_hkeys_precomputation_15
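# First pass only: extend the on-stack table of hash-key powers. Each step
# multiplies the previous group of powers by the key broadcast in %zmm3
# (schoolbook vpclmulqdq products followed by a POLY2 reduction) and stores
# the result at decreasing offsets, 448(%rsp) down to 0(%rsp).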
|
|
vmovdqu64 640(%rsp),%zmm3
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
|
|
|
|
vmovdqu64 576(%rsp),%zmm4
|
|
vmovdqu64 512(%rsp),%zmm5
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,448(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,384(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,320(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,256(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,192(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,128(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,64(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,0(%rsp)
|
|
.L_skip_hkeys_precomputation_15:
|
|
movq $1,%r14
|
|
addq $512,%r11
|
|
subq $512,%r8
|
|
|
|
cmpq $768,%r8
|
|
jb .L_no_more_big_nblocks_10
|
|
.L_encrypt_big_nblocks_10:
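# Main loop: each iteration CTR-encrypts 48 blocks (three groups of 16,
# 768 bytes) while folding in the GHASH of the 48 ciphertext blocks parked on
# the stack by the previous pass, using the hash-key powers prepared above.
# AES rounds and vpclmulqdq products are interleaved, presumably to overlap
# their latencies.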
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_16
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_16
|
|
.L_16_blocks_overflow_16:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_16:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp),%zmm1
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vpxorq %zmm12,%zmm6,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqa64 %zmm0,1280(%rsp)
|
|
vmovdqa64 %zmm3,1344(%rsp)
|
|
vmovdqa64 %zmm4,1408(%rsp)
|
|
vmovdqa64 %zmm5,1472(%rsp)
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_17
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_17
|
|
.L_16_blocks_overflow_17:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_17:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 256(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 320(%rsp),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 384(%rsp),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 448(%rsp),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 256(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 320(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 384(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 448(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vpternlogq $0x96,%zmm12,%zmm6,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,256(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,320(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,384(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,448(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqa64 %zmm0,768(%rsp)
|
|
vmovdqa64 %zmm3,832(%rsp)
|
|
vmovdqa64 %zmm4,896(%rsp)
|
|
vmovdqa64 %zmm5,960(%rsp)
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_18
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_18
|
|
.L_16_blocks_overflow_18:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_18:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 512(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 576(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 640(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 704(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
|
|
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
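# Third group of the iteration: the GHASH products accumulated in
# %zmm24/%zmm25/%zmm26 are folded horizontally and reduced with POLY2 here,
# interleaved with the remaining AES rounds. The resulting 128-bit hash lands
# in %xmm6 and is copied to %xmm14 after the stores below.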
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpternlogq $0x96,%zmm15,%zmm12,%zmm6
|
|
vpxorq %zmm24,%zmm6,%zmm6
|
|
vpternlogq $0x96,%zmm10,%zmm13,%zmm7
|
|
vpxorq %zmm25,%zmm7,%zmm7
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vextracti64x4 $1,%zmm6,%ymm12
|
|
vpxorq %ymm12,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm12
|
|
vpxorq %xmm12,%xmm6,%xmm6
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm6
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,512(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,576(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,640(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,704(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqa64 %zmm0,1024(%rsp)
|
|
vmovdqa64 %zmm3,1088(%rsp)
|
|
vmovdqa64 %zmm4,1152(%rsp)
|
|
vmovdqa64 %zmm5,1216(%rsp)
|
|
vmovdqa64 %zmm6,%zmm14
|
|
|
|
addq $768,%r11
|
|
subq $768,%r8
|
|
cmpq $768,%r8
|
|
jae .L_encrypt_big_nblocks_10
|
|
|
|
.L_no_more_big_nblocks_10:
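# Fewer than 48 blocks remain. Depending on the amount left, encrypt two more
# groups of 16 (>= 512 bytes), one group of 16 (>= 256 bytes), or go straight
# to hashing the parked blocks plus the tail.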
|
|
|
|
cmpq $512,%r8
|
|
jae .L_encrypt_32_blocks_10
|
|
|
|
cmpq $256,%r8
|
|
jae .L_encrypt_16_blocks_10
|
|
.L_encrypt_0_blocks_ghash_32_10:
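# No full 16-block group is left to encrypt, so finish the deferred GHASH of
# the 32 parked ciphertext blocks. %rbx (256 minus the tail length rounded
# down to a block boundary) selects the matching hash-key powers in the stack
# table, and the partial products accumulate in %zmm24/%zmm25/%zmm26.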
|
|
movl %r8d,%r10d
|
|
andl $~15,%r10d
|
|
movl $256,%ebx
|
|
subl %r10d,%ebx
|
|
vmovdqa64 768(%rsp),%zmm13
|
|
vpxorq %zmm14,%zmm13,%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 832(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpxorq %zmm10,%zmm4,%zmm26
|
|
vpxorq %zmm6,%zmm0,%zmm24
|
|
vpxorq %zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 896(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 960(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
addl $256,%ebx
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
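# %r10d now holds the number of remaining 16-byte blocks, rounded up (1..16).
# The compare/branch chain below is a small decision tree dispatching to a
# dedicated handler for each possible count.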
|
|
je .L_last_num_blocks_is_0_19
|
|
|
|
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_19
|
|
jb .L_last_num_blocks_is_7_1_19
|
|
|
|
|
|
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_19
|
|
jb .L_last_num_blocks_is_11_9_19
|
|
|
|
|
|
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_19
|
|
ja .L_last_num_blocks_is_16_19
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_19
|
|
jmp .L_last_num_blocks_is_13_19
|
|
|
|
.L_last_num_blocks_is_11_9_19:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_19
|
|
ja .L_last_num_blocks_is_11_19
|
|
jmp .L_last_num_blocks_is_9_19
|
|
|
|
.L_last_num_blocks_is_7_1_19:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_19
|
|
jb .L_last_num_blocks_is_3_1_19
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_19
|
|
je .L_last_num_blocks_is_6_19
|
|
jmp .L_last_num_blocks_is_5_19
|
|
|
|
.L_last_num_blocks_is_3_1_19:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_19
|
|
je .L_last_num_blocks_is_2_19
|
|
.L_last_num_blocks_is_1_19:
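# Every ".L_last_num_blocks_is_N" handler follows the same pattern: build a
# byte mask from byte64_len_to_mask_table, generate the needed counter blocks
# (with a per-count overflow check on the low counter byte), run the AES
# rounds interleaved with the GHASH of the parked blocks, perform a masked
# load/XOR/store of the tail, and then either reduce the hash completely or,
# when the last block is partial, stash that ciphertext block at 16(%rsi) and
# its length at (%rdx) for the next call.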
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_20
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_20
|
|
|
|
.L_16_blocks_overflow_20:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_20:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %xmm29,%xmm0,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_21
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_21
|
|
.L_small_initial_partial_block_21:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm0
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
|
|
vpslldq $8,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm3
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
|
|
vpsrldq $4,%xmm4,%xmm4
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm4,%xmm14
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_21
|
|
.L_small_initial_compute_done_21:
|
|
.L_after_reduction_21:
|
|
jmp .L_last_blocks_done_19
|
|
.L_last_num_blocks_is_2_19:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_22
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_22
|
|
|
|
.L_16_blocks_overflow_22:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_22:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %ymm29,%ymm0,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_23
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_23
|
|
.L_small_initial_partial_block_23:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_23:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_23
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_23:
|
|
jmp .L_last_blocks_done_19
|
|
.L_last_num_blocks_is_3_19:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_24
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_24
|
|
|
|
.L_16_blocks_overflow_24:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_24:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_25
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_25
|
|
.L_small_initial_partial_block_25:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_25:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_25
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_25:
|
|
jmp .L_last_blocks_done_19
|
|
.L_last_num_blocks_is_4_19:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_26
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_26
|
|
|
|
.L_16_blocks_overflow_26:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_26:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_27
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_27
|
|
.L_small_initial_partial_block_27:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_27:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_27
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_27:
|
|
jmp .L_last_blocks_done_19
|
|
.L_last_num_blocks_is_5_19:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_28
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_28
|
|
|
|
.L_16_blocks_overflow_28:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_28:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %xmm29,%xmm3,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_29
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_29
|
|
.L_small_initial_partial_block_29:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_29:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_29
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_29:
|
|
jmp .L_last_blocks_done_19
|
|
.L_last_num_blocks_is_6_19:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_30
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_30
|
|
|
|
.L_16_blocks_overflow_30:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_30:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %ymm29,%ymm3,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_31
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_31
|
|
.L_small_initial_partial_block_31:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_31:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_31
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_31:
|
|
jmp .L_last_blocks_done_19
|
|
.L_last_num_blocks_is_7_19:
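# 7-block tail: same structure as the 6-block path above, with zmm3
# carrying blocks 5-7 under the k1 byte mask.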
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_32
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_32
|
|
|
|
.L_16_blocks_overflow_32:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_32:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_33
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_33
|
|
.L_small_initial_partial_block_33:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_33:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_33
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_33:
|
|
jmp .L_last_blocks_done_19
|
|
.L_last_num_blocks_is_8_19:
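# 8-block tail: two full zmm lanes of counter blocks; the second 64-byte
# chunk is still loaded and stored under the k1 byte mask.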
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_34
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_34
|
|
|
|
.L_16_blocks_overflow_34:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_34:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_35
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_35
|
|
.L_small_initial_partial_block_35:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_35:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_35
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_35:
|
|
jmp .L_last_blocks_done_19
|
|
.L_last_num_blocks_is_9_19:
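# 9-block tail: a third lane is started (a single block in xmm4) and k1 is
# derived from len-128 to mask the third 64-byte chunk.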
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_36
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_36
|
|
|
|
.L_16_blocks_overflow_36:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_36:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %xmm29,%xmm4,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_37
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_37
|
|
.L_small_initial_partial_block_37:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_37:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_37
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_37:
|
|
jmp .L_last_blocks_done_19
|
|
.L_last_num_blocks_is_10_19:
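# 10-block tail: the third lane holds two blocks (ymm4), masked by k1 over
# the third 64-byte chunk.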
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_38
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_38
|
|
|
|
.L_16_blocks_overflow_38:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_38:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %ymm29,%ymm4,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_39
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_39
|
|
.L_small_initial_partial_block_39:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_39:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_39
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_39:
|
|
jmp .L_last_blocks_done_19
|
|
.L_last_num_blocks_is_11_19:
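# 11-block tail: the third lane holds three blocks (zmm4 under the k1 byte
# mask).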
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_40
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_40
|
|
|
|
.L_16_blocks_overflow_40:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_40:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_41
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_41
|
|
.L_small_initial_partial_block_41:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_41:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_41
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_41:
|
|
jmp .L_last_blocks_done_19
|
|
.L_last_num_blocks_is_12_19:
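# 12-block tail: three full zmm lanes of counter blocks; k1 still masks the
# possibly partial third 64-byte chunk.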
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_42
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_42
|
|
|
|
.L_16_blocks_overflow_42:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_42:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_43
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_43
|
|
.L_small_initial_partial_block_43:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_43:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_43
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_43:
|
|
jmp .L_last_blocks_done_19
|
|
.L_last_num_blocks_is_13_19:
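# 13-block tail: a fourth lane is started (a single block in xmm5); k1 is
# derived from len-192 for the fourth 64-byte chunk.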
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_44
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_44
|
|
|
|
.L_16_blocks_overflow_44:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_44:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %xmm29,%xmm5,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_45
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_45
|
|
.L_small_initial_partial_block_45:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_45:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_45
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_45:
|
|
jmp .L_last_blocks_done_19
|
|
.L_last_num_blocks_is_14_19:
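# 14-block tail: the fourth lane holds two blocks (ymm5), masked by k1 over
# the fourth 64-byte chunk.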
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_46
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_46
|
|
|
|
.L_16_blocks_overflow_46:
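# Counter overflow path: byte-swap the counter block so the 32-bit add
# carries correctly, add the ddq_add_1234 / ddq_add_4444 increments, then
# swap each counter block back to CTR byte order.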
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_46:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %ymm29,%ymm5,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_47
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_47
|
|
.L_small_initial_partial_block_47:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_47:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_47
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_47:
|
|
jmp .L_last_blocks_done_19
|
|
.L_last_num_blocks_is_15_19:
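# Tail handler for 15 remaining blocks; same structure as the 14-block
# case, with the overflow threshold at 241 (241 + 15 > 255) and the
# fourth counter/cipher group kept on full zmm registers.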
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_48
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_48
|
|
|
|
.L_16_blocks_overflow_48:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_48:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_49
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_49
|
|
.L_small_initial_partial_block_49:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_49:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_49
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_49:
|
|
jmp .L_last_blocks_done_19
|
|
.L_last_num_blocks_is_16_19:
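# Tail handler for a full 16 remaining blocks (overflow threshold 240).
# This case always falls through to the partial-block bookkeeping below:
# the leftover length is written through %rdx unconditionally, since the
# sixteenth block may itself be partial.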
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_50
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_50
|
|
|
|
.L_16_blocks_overflow_50:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_50:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_51:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_51:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_51:
|
|
jmp .L_last_blocks_done_19
|
|
.L_last_num_blocks_is_0_19:
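# No blocks left to encrypt: multiply the 16 byte-reflected ciphertext
# blocks stashed at 1024..1216(%rsp) by the hash-key powers indexed from
# (%rsp,%rbx), accumulate into %zmm24/%zmm25/%zmm26, and do the final
# POLY2 reduction into %xmm14.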
|
|
vmovdqa64 1024(%rsp),%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1088(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1152(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1216(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_19:
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_10
|
|
.L_encrypt_32_blocks_10:
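# Main 32-block body: two back-to-back 16-block AES-CTR passes.  Each
# pass XORs the running hash into 16 previously stashed (byte-swapped)
# ciphertext blocks from the stack and multiplies them by stacked
# hash-key powers while the AES rounds for the next 16 counter blocks
# proceed in parallel; fresh ciphertext is written out and its
# byte-swapped copy stashed for a later pass.  A trailing fold
# (1280..1472 against keys at 512..704(%rsp)) plus the POLY2 reduction
# finish the running hash before the tail dispatch.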
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_52
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_52
|
|
.L_16_blocks_overflow_52:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_52:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vpxorq %zmm12,%zmm6,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqa64 %zmm0,1280(%rsp)
|
|
vmovdqa64 %zmm3,1344(%rsp)
|
|
vmovdqa64 %zmm4,1408(%rsp)
|
|
vmovdqa64 %zmm5,1472(%rsp)
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_53
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_53
|
|
.L_16_blocks_overflow_53:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_53:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 256(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 320(%rsp),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 384(%rsp),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 448(%rsp),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 256(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 320(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 384(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 448(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vpternlogq $0x96,%zmm12,%zmm6,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,256(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,320(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,384(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,448(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqa64 %zmm0,768(%rsp)
|
|
vmovdqa64 %zmm3,832(%rsp)
|
|
vmovdqa64 %zmm4,896(%rsp)
|
|
vmovdqa64 %zmm5,960(%rsp)
|
|
vmovdqa64 1280(%rsp),%zmm13
|
|
vmovdqu64 512(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1344(%rsp),%zmm13
|
|
vmovdqu64 576(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1408(%rsp),%zmm13
|
|
vmovdqu64 640(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1472(%rsp),%zmm13
|
|
vmovdqu64 704(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
subq $512,%r8
|
|
addq $512,%r11
|
|
movl %r8d,%r10d
|
|
andl $~15,%r10d
|
|
movl $512,%ebx
|
|
subl %r10d,%ebx
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
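# %r10d = ceil(remaining_bytes / 16), i.e. the number of 16-byte blocks
# still to process, counting a trailing partial block as one.  The
# compare tree below dispatches to the matching
# .L_last_num_blocks_is_N_54 handler (anything above 15 is treated as 16).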
|
|
je .L_last_num_blocks_is_0_54
|
|
|
|
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_54
|
|
jb .L_last_num_blocks_is_7_1_54
|
|
|
|
|
|
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_54
|
|
jb .L_last_num_blocks_is_11_9_54
|
|
|
|
|
|
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_54
|
|
ja .L_last_num_blocks_is_16_54
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_54
|
|
jmp .L_last_num_blocks_is_13_54
|
|
|
|
.L_last_num_blocks_is_11_9_54:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_54
|
|
ja .L_last_num_blocks_is_11_54
|
|
jmp .L_last_num_blocks_is_9_54
|
|
|
|
.L_last_num_blocks_is_7_1_54:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_54
|
|
jb .L_last_num_blocks_is_3_1_54
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_54
|
|
je .L_last_num_blocks_is_6_54
|
|
jmp .L_last_num_blocks_is_5_54
|
|
|
|
.L_last_num_blocks_is_3_1_54:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_54
|
|
je .L_last_num_blocks_is_2_54
|
|
.L_last_num_blocks_is_1_54:
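# One remaining (possibly partial) block: the counter, AES rounds and
# final XOR run on a single xmm lane, the load/store use the byte mask in
# %k1, and the GHASH fold of the 16 stashed blocks still runs on full zmm
# registers alongside the rounds.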
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_55
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_55
|
|
|
|
.L_16_blocks_overflow_55:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_55:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %xmm29,%xmm0,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_56
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_56
|
|
.L_small_initial_partial_block_56:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm0
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
|
|
vpslldq $8,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm3
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
|
|
vpsrldq $4,%xmm4,%xmm4
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm4,%xmm14
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_56
|
|
.L_small_initial_compute_done_56:
|
|
.L_after_reduction_56:
|
|
jmp .L_last_blocks_done_54
|
|
.L_last_num_blocks_is_2_54:
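# Two remaining blocks: same as the one-block case but on ymm lanes
# (overflow threshold 254).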
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_57
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_57
|
|
|
|
.L_16_blocks_overflow_57:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_57:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %ymm29,%ymm0,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_58
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_58
|
|
.L_small_initial_partial_block_58:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_58:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_58
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_58:
|
|
jmp .L_last_blocks_done_54
|
|
.L_last_num_blocks_is_3_54:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_59
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_59
|
|
|
|
.L_16_blocks_overflow_59:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_59:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_60
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_60
|
|
.L_small_initial_partial_block_60:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_60:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_60
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_60:
|
|
jmp .L_last_blocks_done_54
|
|
.L_last_num_blocks_is_4_54:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_61
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_61
|
|
|
|
.L_16_blocks_overflow_61:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_61:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_62
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_62
|
|
.L_small_initial_partial_block_62:
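# Final block is partial: record the remaining byte count at (%rdx), stash
# %xmm11 at 16(%rsi) (presumably for partial-block completion later), and
# GHASH only the full blocks; the reflected partial block (%xmm7) is XORed
# in at .L_small_initial_compute_done_62 below.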
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_62:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_62
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_62:
|
|
jmp .L_last_blocks_done_54
|
|
.L_last_num_blocks_is_5_54:
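# Tail path for 5 remaining blocks (at most 80 bytes): a ZMM lane for blocks
# 1-4 plus an XMM lane (%xmm3) for block 5; the byte mask now covers the
# second 64-byte chunk, hence the subq $64 before the table lookup.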
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_63
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_63
|
|
|
|
.L_16_blocks_overflow_63:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_63:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %xmm29,%xmm3,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_64
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_64
|
|
.L_small_initial_partial_block_64:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_64:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_64
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_64:
|
|
jmp .L_last_blocks_done_54
|
|
.L_last_num_blocks_is_6_54:
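# Tail path for 6 remaining blocks (at most 96 bytes): as above, but blocks
# 5-6 ride in a YMM lane (%ymm3).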
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_65
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_65
|
|
|
|
.L_16_blocks_overflow_65:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_65:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %ymm29,%ymm3,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_66
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_66
|
|
.L_small_initial_partial_block_66:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_66:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_66
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_66:
|
|
jmp .L_last_blocks_done_54
|
|
.L_last_num_blocks_is_7_54:
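# Tail path for 7 remaining blocks (at most 112 bytes): a second ZMM lane
# (%zmm3) carries blocks 5-7, with the byte mask applied to its load/store.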
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_67
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_67
|
|
|
|
.L_16_blocks_overflow_67:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_67:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_68
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_68
|
|
.L_small_initial_partial_block_68:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_68:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_68
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_68:
|
|
jmp .L_last_blocks_done_54
|
|
.L_last_num_blocks_is_8_54:
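# Tail path for 8 remaining blocks (at most 128 bytes): two full ZMM lanes;
# only the very last block can be partial, selected by %k1.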
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_69
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_69
|
|
|
|
.L_16_blocks_overflow_69:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_69:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_70
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_70
|
|
.L_small_initial_partial_block_70:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_70:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_70
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_70:
|
|
jmp .L_last_blocks_done_54
|
|
.L_last_num_blocks_is_9_54:
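# Tail path for 9 remaining blocks (at most 144 bytes): two ZMM lanes plus an
# XMM lane (%xmm4) for block 9; the mask now covers the third 64-byte chunk
# (subq $128 before the table lookup).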
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_71
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_71
|
|
|
|
.L_16_blocks_overflow_71:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_71:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %xmm29,%xmm4,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_72
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_72
|
|
.L_small_initial_partial_block_72:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_72:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_72
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_72:
|
|
jmp .L_last_blocks_done_54
|
|
.L_last_num_blocks_is_10_54:
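# Tail path for 10 remaining blocks (at most 160 bytes): blocks 9-10 ride in
# a YMM lane (%ymm4).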
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_73
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_73
|
|
|
|
.L_16_blocks_overflow_73:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_73:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %ymm29,%ymm4,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_74
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_74
|
|
.L_small_initial_partial_block_74:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_74:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_74
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_74:
|
|
jmp .L_last_blocks_done_54
|
|
.L_last_num_blocks_is_11_54:
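# Tail path for 11 remaining blocks (at most 176 bytes): a third ZMM lane
# (%zmm4) carries blocks 9-11 under the byte mask.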
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_75
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_75
|
|
|
|
.L_16_blocks_overflow_75:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_75:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_76
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_76
|
|
.L_small_initial_partial_block_76:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_76:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_76
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_76:
|
|
jmp .L_last_blocks_done_54
|
|
.L_last_num_blocks_is_12_54:
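# Tail path for 12 remaining blocks (at most 192 bytes): three full ZMM
# lanes; only the last block can be partial.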
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_77
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_77
|
|
|
|
.L_16_blocks_overflow_77:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_77:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_78
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_78
|
|
.L_small_initial_partial_block_78:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
.L_small_initial_compute_done_78:
orq %r8,%r8
|
|
je .L_after_reduction_78
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_78:
|
|
jmp .L_last_blocks_done_54
|
|
.L_last_num_blocks_is_13_54:
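# Tail handler for exactly 13 remaining blocks: %r8 minus 192 indexes
# byte64_len_to_mask_table to build the load/store mask %k1 for the final sub-64-byte
# group, and the compare against 243 (256 - 13) appears to test whether adding 13 to the
# low counter byte would wrap, selecting the slower carry-propagating path below.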
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_79
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_79
.L_16_blocks_overflow_79:
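# Counter-wrap path: the counter block is byte-shuffled (vpshufb %zmm29) into the other
# byte order so the dword adds of ddq_add_1234 / ddq_add_4444 carry correctly across
# bytes, and the resulting counters are shuffled back before use.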
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_79:
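# From here the AES rounds on the 13 counter blocks (round keys broadcast from
# 0..160(%rdi); the last key at offset 160 suggests this is the AES-128 specialization)
# are interleaved with vpclmulqdq GHASH multiplies of the 16 previously buffered
# ciphertext blocks at 768..960(%rsp) against key powers indexed by %rbx lower in the
# frame, followed by the masked tail load at 192(%rcx,%r11) and the matching masked store.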
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %xmm29,%xmm5,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_80
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_80
|
|
.L_small_initial_partial_block_80:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_80:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_80
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_80:
|
|
jmp .L_last_blocks_done_54
|
|
.L_last_num_blocks_is_14_54:
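# Same tail pattern as the 13-block case, specialized for 14 remaining blocks: the fourth
# counter/data group lives in a YMM register and the wrap check uses 242 (256 - 14).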
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_81
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_81
|
|
|
|
.L_16_blocks_overflow_81:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_81:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %ymm29,%ymm5,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_82
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_82
|
|
.L_small_initial_partial_block_82:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_82:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_82
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_82:
|
|
jmp .L_last_blocks_done_54
|
|
.L_last_num_blocks_is_15_54:
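# 15-block tail: identical structure, with the wrap check against 241 (256 - 15) and a
# full ZMM used for the fourth, masked counter/data group.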
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_83
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_83
|
|
|
|
.L_16_blocks_overflow_83:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_83:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_84
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_84
|
|
.L_small_initial_partial_block_84:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_84:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_84
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_84:
|
|
jmp .L_last_blocks_done_54
|
|
.L_last_num_blocks_is_16_54:
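# 16-block tail: the wrap check uses 240 (256 - 16); all four counter groups are full
# ZMMs and only the final 16-byte block can be short, handled through mask %k1.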
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_85
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_85
|
|
|
|
.L_16_blocks_overflow_85:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_85:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_86:
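# For the 16-block tail the partial-block accumulation path is entered unconditionally:
# whatever remains in %r8 after the subtract above is written to (%rdx), and the
# byte-reflected last block (%xmm7) is folded into the running hash at the compute_done
# label below.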
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_86:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_86:
|
|
jmp .L_last_blocks_done_54
|
|
.L_last_num_blocks_is_0_54:
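# No encrypted tail remains: this path appears to only finish the deferred GHASH work,
# folding the four stashed 64-byte ciphertext groups at 768..960(%rsp) into the hash
# state with the key powers cached at (%rsp,%rbx) and performing the final POLY2
# reduction into %xmm14.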
|
|
vmovdqa64 768(%rsp),%zmm13
|
|
vpxorq %zmm14,%zmm13,%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 832(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpxorq %zmm10,%zmm4,%zmm26
|
|
vpxorq %zmm6,%zmm0,%zmm24
|
|
vpxorq %zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 896(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 960(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_54:
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_10
|
|
.L_encrypt_16_blocks_10:
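# Main 16-blocks-at-a-time step: generate 16 fresh counter blocks (wrap check of %r15b
# against 240), run the AES rounds on them while folding previously buffered,
# byte-reflected ciphertext (768(%rsp) onward) into the GHASH accumulators with the
# hash-key powers cached lower in the frame, write 256 bytes of output, and stash the
# byte-reflected new ciphertext at 1280..1472(%rsp) for the next GHASH pass.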
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_87
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_87
|
|
.L_16_blocks_overflow_87:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_87:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp),%zmm1
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vpxorq %zmm12,%zmm6,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqa64 %zmm0,1280(%rsp)
|
|
vmovdqa64 %zmm3,1344(%rsp)
|
|
vmovdqa64 %zmm4,1408(%rsp)
|
|
vmovdqa64 %zmm5,1472(%rsp)
|
|
vmovdqa64 1024(%rsp),%zmm13
|
|
vmovdqu64 256(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1088(%rsp),%zmm13
|
|
vmovdqu64 320(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1152(%rsp),%zmm13
|
|
vmovdqu64 384(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1216(%rsp),%zmm13
|
|
vmovdqu64 448(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
subq $256,%r8
|
|
addq $256,%r11
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
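# %r10d now holds ceil(remaining_bytes / 16), i.e. the number of 16-byte blocks left
# (0..16); the compare/branch ladder below dispatches to one of the specialized
# .L_last_num_blocks_is_N_88 tail handlers.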
|
|
je .L_last_num_blocks_is_0_88
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_88
|
|
jb .L_last_num_blocks_is_7_1_88
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_88
|
|
jb .L_last_num_blocks_is_11_9_88
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_88
|
|
ja .L_last_num_blocks_is_16_88
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_88
|
|
jmp .L_last_num_blocks_is_13_88
|
|
|
|
.L_last_num_blocks_is_11_9_88:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_88
|
|
ja .L_last_num_blocks_is_11_88
|
|
jmp .L_last_num_blocks_is_9_88
|
|
|
|
.L_last_num_blocks_is_7_1_88:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_88
|
|
jb .L_last_num_blocks_is_3_1_88
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_88
|
|
je .L_last_num_blocks_is_6_88
|
|
jmp .L_last_num_blocks_is_5_88
|
|
|
|
.L_last_num_blocks_is_3_1_88:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_88
|
|
je .L_last_num_blocks_is_2_88
|
|
.L_last_num_blocks_is_1_88:
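# Single trailing block: one masked 16-byte CTR encryption (mask %k1 from
# byte64_len_to_mask_table), still interleaved with the deferred GHASH of the 16 blocks
# stashed at 1280..1472(%rsp) against what appear to be key powers at 512..704(%rsp),
# ending in the usual POLY2 reduction into %xmm14.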
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_89
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_89
|
|
|
|
.L_16_blocks_overflow_89:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_89:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %xmm29,%xmm0,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_90
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_90
|
|
.L_small_initial_partial_block_90:
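# Partial single block: nothing new is multiplied here; the byte count goes to (%rdx),
# the last ciphertext block is saved at 16(%rsi), and the byte-reflected block (%xmm7)
# is simply XORed into the running hash so a later call can complete the GHASH.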
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_90
|
|
.L_small_initial_compute_done_90:
|
|
.L_after_reduction_90:
|
|
jmp .L_last_blocks_done_88
|
|
.L_last_num_blocks_is_2_88:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_91
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_91
|
|
|
|
.L_16_blocks_overflow_91:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_91:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %ymm29,%ymm0,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_92
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_92
|
|
.L_small_initial_partial_block_92:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_92:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_92
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_92:
|
|
jmp .L_last_blocks_done_88
|
|
.L_last_num_blocks_is_3_88:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_93
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_93
|
|
|
|
.L_16_blocks_overflow_93:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_93:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_94
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_94
|
|
.L_small_initial_partial_block_94:
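# The last block is partial: record the leftover byte count at (%rdx) and stash
# the final output block %xmm11 at 16(%rsi) (presumably so a later call can
# complete it), then GHASH only the complete blocks using the powers at
# 320(%rsi) and reduce as above.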
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_94:
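# If %r8 is zero nothing is left over; otherwise XOR the byte-reflected final
# block %xmm7 into the hash accumulator %xmm14.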
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_94
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_94:
|
|
jmp .L_last_blocks_done_88
|
|
.L_last_num_blocks_is_4_88:
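# 4 blocks remain.  Build the byte mask for %r8 bytes from
# byte64_len_to_mask_table, then derive four counter blocks from %zmm2: if the
# low counter byte is about to wrap (%r15d >= 252) take the overflow path
# below, otherwise a single vpaddd with the pre-shuffled increments in %zmm28
# suffices.  The rest mirrors the 3-block case, with lane 3 as the last block.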
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_95
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_95
|
|
|
|
.L_16_blocks_overflow_95:
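# Counter carry handling: byte-swap the counter block %zmm2 with the shuffle
# mask in %zmm29, add 1..4 as ordinary little-endian dword additions
# (ddq_add_1234), and swap back so the carry propagates correctly.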
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_95:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_96
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_96
|
|
.L_small_initial_partial_block_96:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_96:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_96
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_96:
|
|
jmp .L_last_blocks_done_88
|
|
.L_last_num_blocks_is_5_88:
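# 5 blocks remain: the mask now describes the bytes past the first full ZMM
# (hence subq $64), the fifth counter block is carried in %xmm3 and pushed
# through the same AES rounds alongside %zmm0, and its masked result is hashed
# together with the first four blocks.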
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_97
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_97
|
|
|
|
.L_16_blocks_overflow_97:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_97:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %xmm29,%xmm3,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_98
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_98
|
|
.L_small_initial_partial_block_98:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_98:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_98
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_98:
|
|
jmp .L_last_blocks_done_88
|
|
.L_last_num_blocks_is_6_88:
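# 6 blocks remain: same structure as the 5-block case, but blocks 5-6 travel
# in %ymm3 and the counter wrap check is against 250.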
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_99
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_99
|
|
|
|
.L_16_blocks_overflow_99:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_99:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %ymm29,%ymm3,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_100
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_100
|
|
.L_small_initial_partial_block_100:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_100:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_100
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_100:
|
|
jmp .L_last_blocks_done_88
|
|
.L_last_num_blocks_is_7_88:
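# 7 blocks remain: blocks 5-7 travel in a second ZMM (%zmm3) whose load and
# store are limited to the valid lanes by the %k1 byte mask.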
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_101
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_101
|
|
|
|
.L_16_blocks_overflow_101:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_101:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_102
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_102
|
|
.L_small_initial_partial_block_102:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_102:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_102
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_102:
|
|
jmp .L_last_blocks_done_88
|
|
.L_last_num_blocks_is_8_88:
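# 8 blocks remain: two full ZMM counter vectors (%zmm0 and %zmm3); %k1 still
# trims the second load/store to the exact byte length.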
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_103
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_103
|
|
|
|
.L_16_blocks_overflow_103:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_103:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_104
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_104
|
|
.L_small_initial_partial_block_104:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_104:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_104
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_104:
|
|
jmp .L_last_blocks_done_88
|
|
.L_last_num_blocks_is_9_88:
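# 9 blocks remain: the mask now covers the bytes past the first 128 (hence
# subq $128); a third counter vector is carried in %xmm4 for the ninth block.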
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_105
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_105
|
|
|
|
.L_16_blocks_overflow_105:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_105:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %xmm29,%xmm4,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_106
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_106
|
|
.L_small_initial_partial_block_106:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_106:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_106
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_106:
|
|
jmp .L_last_blocks_done_88
|
|
.L_last_num_blocks_is_10_88:
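# 10 blocks remain: as above, with blocks 9-10 carried in %ymm4.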
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_107
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_107
|
|
|
|
.L_16_blocks_overflow_107:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_107:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %ymm29,%ymm4,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_108
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_108
|
|
.L_small_initial_partial_block_108:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_108:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_108
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_108:
|
|
jmp .L_last_blocks_done_88
|
|
.L_last_num_blocks_is_11_88:
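# 11 blocks remain: blocks 9-11 in a masked third ZMM (%zmm4); otherwise the
# same encrypt-while-hashing pattern as the shorter tails.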
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_109
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_109
|
|
|
|
.L_16_blocks_overflow_109:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_109:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_110
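#
# Complete final block: clear the partial-block length at (%rdx), fold the
# running hash (%xmm14) into the first output block, multiply all 11 new
# blocks by the matching powers of H kept at (%rsi) (highest power first),
# and reduce with POLY2 into %xmm14.
#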
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_110
|
|
.L_small_initial_partial_block_110:
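#
# Partial final block: record the number of valid bytes at (%rdx) and save
# the last output block (%xmm11) at 16(%rsi) for the partial-block logic;
# GHASH only the 10 complete blocks (the multiplies start at 192(%rsi)
# instead of 176(%rsi)), then XOR the zero-padded last block into the hash
# at .L_small_initial_compute_done_110 below; its multiply by H is left
# for later.
#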
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_110:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_110
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_110:
|
|
jmp .L_last_blocks_done_88
|
|
.L_last_num_blocks_is_12_88:
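#
# Tail handler for a final chunk of 12 blocks: build the byte mask for the
# third 64-byte group from byte64_len_to_mask_table (length - 128),
# generate 12 counter blocks in %zmm0/%zmm3/%zmm4 (slow path below if the
# low counter byte in %r15d would wrap: 244 = 256 - 12), run the AES
# rounds interleaved with the GHASH of the 16 blocks saved earlier on the
# stack, then XOR with the masked input, store, and hash the new output.
#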
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_111
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_111
|
|
|
|
.L_16_blocks_overflow_111:
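# Counter low byte would wrap: byte-swap the counter with the %zmm29
# shuffle mask, add ddq_add_1234 / ddq_add_4444 as plain 32-bit integers,
# and shuffle the results back into the same form the fast path produces.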
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_111:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_112
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_112
|
|
.L_small_initial_partial_block_112:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_112:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_112
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_112:
|
|
jmp .L_last_blocks_done_88
|
|
.L_last_num_blocks_is_13_88:
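# 13-block tail: same pattern, with a fourth counter register of which
# only the low 128-bit lane (%xmm5) is live; the load/store mask covers
# the bytes past 192 and the wrap threshold is 256 - 13 = 243.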
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_113
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_113
|
|
|
|
.L_16_blocks_overflow_113:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_113:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %xmm29,%xmm5,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_114
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_114
|
|
.L_small_initial_partial_block_114:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_114:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_114
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_114:
|
|
jmp .L_last_blocks_done_88
|
|
.L_last_num_blocks_is_14_88:
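# 14-block tail: as above with two blocks in the fourth register (%ymm5);
# wrap threshold 256 - 14 = 242.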
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_115
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_115
|
|
|
|
.L_16_blocks_overflow_115:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_115:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %ymm29,%ymm5,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_116
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_116
|
|
.L_small_initial_partial_block_116:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_116:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_116
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_116:
|
|
jmp .L_last_blocks_done_88
|
|
.L_last_num_blocks_is_15_88:
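# 15-block tail: the fourth register is a full %zmm5 whose top lane is
# masked off on load and store; wrap threshold 256 - 15 = 241.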
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_117
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_117
|
|
|
|
.L_16_blocks_overflow_117:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_117:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_118
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_118
|
|
.L_small_initial_partial_block_118:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_118:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_118
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_118:
|
|
jmp .L_last_blocks_done_88
|
|
.L_last_num_blocks_is_16_88:
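# 16-block tail (wrap threshold 256 - 16 = 240).  There is no separate
# "complete last block" branch here: the code always takes the
# partial-block bookkeeping below, hashing the first 15 output blocks and
# XOR-ing the 16th into %xmm14 without a multiply.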
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_119
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_119
|
|
|
|
.L_16_blocks_overflow_119:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_119:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_120:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_120:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_120:
|
|
jmp .L_last_blocks_done_88
|
|
.L_last_num_blocks_is_0_88:
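#
# No new blocks to process: multiply the 16 blocks saved at
# 1280..1472(%rsp) by the hash-key powers at 512..704(%rsp), accumulate
# into %zmm24/%zmm25/%zmm26, then fold the lanes and reduce with POLY2
# into %xmm14.
#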
|
|
vmovdqa64 1280(%rsp),%zmm13
|
|
vmovdqu64 512(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1344(%rsp),%zmm13
|
|
vmovdqu64 576(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1408(%rsp),%zmm13
|
|
vmovdqu64 640(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1472(%rsp),%zmm13
|
|
vmovdqu64 704(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_88:
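# Byte-swap the counter in %xmm2 back and rejoin the common
# .L_ghash_done_10 path.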
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_10
|
|
|
|
.L_message_below_32_blocks_10:
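#
# Taken when fewer than 32 blocks were left: 256 bytes (16 blocks) have
# just been processed, so adjust the length and offset, extend the
# on-stack table of hash-key powers unless %r14 says that was already
# done, and dispatch on the number of remaining 16-byte blocks to the
# .L_last_num_blocks_is_*_122 handlers.
#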
|
|
|
|
|
|
subq $256,%r8
|
|
addq $256,%r11
|
|
movl %r8d,%r10d
|
|
testq %r14,%r14
|
|
jnz .L_skip_hkeys_precomputation_121
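#
# Precompute four more 512-bit groups of hash-key powers: broadcast the
# low 128-bit lane of 640(%rsp), carry-less multiply the stored groups at
# 576(%rsp) and 512(%rsp) (and then the new results) by it, reduce each
# product with POLY2, and store the groups at 448, 384, 320 and 256(%rsp).
#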
|
|
vmovdqu64 640(%rsp),%zmm3
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
|
|
|
|
vmovdqu64 576(%rsp),%zmm4
|
|
vmovdqu64 512(%rsp),%zmm5
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,448(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,384(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,320(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,256(%rsp)
|
|
.L_skip_hkeys_precomputation_121:
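# %r14 = 1 marks the key-power table as ready.  %rbx = 512 minus the
# remaining length rounded down to a 16-byte multiple; the tail handlers
# below use it as the base offset into the on-stack key powers
# (0/64/128/192(%rsp,%rbx,1)).  %r10d becomes the number of remaining
# blocks, rounded up, and the compare tree branches to the matching case.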
|
|
movq $1,%r14
|
|
andl $~15,%r10d
|
|
movl $512,%ebx
|
|
subl %r10d,%ebx
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
|
|
je .L_last_num_blocks_is_0_122
|
|
|
|
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_122
|
|
jb .L_last_num_blocks_is_7_1_122
|
|
|
|
|
|
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_122
|
|
jb .L_last_num_blocks_is_11_9_122
|
|
|
|
|
|
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_122
|
|
ja .L_last_num_blocks_is_16_122
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_122
|
|
jmp .L_last_num_blocks_is_13_122
|
|
|
|
.L_last_num_blocks_is_11_9_122:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_122
|
|
ja .L_last_num_blocks_is_11_122
|
|
jmp .L_last_num_blocks_is_9_122
|
|
|
|
.L_last_num_blocks_is_7_1_122:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_122
|
|
jb .L_last_num_blocks_is_3_1_122
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_122
|
|
je .L_last_num_blocks_is_6_122
|
|
jmp .L_last_num_blocks_is_5_122
|
|
|
|
.L_last_num_blocks_is_3_1_122:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_122
|
|
je .L_last_num_blocks_is_2_122
|
|
.L_last_num_blocks_is_1_122:
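#
# One block left: the running hash (%zmm14) is folded into the first group
# of saved blocks at 768(%rsp); those 16 blocks are multiplied by the key
# powers at (%rsp,%rbx) while a single masked block is encrypted with
# xmm-width AES.  The GHASH products are only accumulated into
# %zmm24/%zmm25/%zmm26 here; the reduction happens in the code below.
#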
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_123
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_123
|
|
|
|
.L_16_blocks_overflow_123:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_123:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %xmm29,%xmm0,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_124
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_124
|
|
.L_small_initial_partial_block_124:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm0
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
|
|
vpslldq $8,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm3
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
|
|
vpsrldq $4,%xmm4,%xmm4
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm4,%xmm14
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_124
|
|
.L_small_initial_compute_done_124:
|
|
.L_after_reduction_124:
|
|
jmp .L_last_blocks_done_122
|
|
.L_last_num_blocks_is_2_122:
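# 2 residual blocks: same tail sequence as the 1-block case, but the
# counter batch, keystream and masked copy are ymm-wide.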
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_125
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_125
|
|
|
|
.L_16_blocks_overflow_125:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_125:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %ymm29,%ymm0,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_126
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_126
|
|
.L_small_initial_partial_block_126:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_126:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_126
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_126:
|
|
jmp .L_last_blocks_done_122
|
|
.L_last_num_blocks_is_3_122:
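# 3 residual blocks: zmm-wide counter batch; %k1 limits the 64-byte
# load/store at (%rcx,%r11) / (%r9,%r11) to the bytes that actually remain.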
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_127
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_127
|
|
|
|
.L_16_blocks_overflow_127:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_127:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_128
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_128
|
|
.L_small_initial_partial_block_128:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_128:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_128
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_128:
|
|
jmp .L_last_blocks_done_122
|
|
.L_last_num_blocks_is_4_122:
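# 4 residual blocks: one full zmm counter batch; the %k1 mask still guards
# the possibly partial final 16 bytes of the store.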
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_129
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_129
|
|
|
|
.L_16_blocks_overflow_129:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_129:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_130
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_130
|
|
.L_small_initial_partial_block_130:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_130:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_130
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_130:
|
|
jmp .L_last_blocks_done_122
|
|
.L_last_num_blocks_is_5_122:
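# 5 residual blocks: a zmm batch plus one extra xmm counter block; %rax is
# first reduced by 64 so byte64_len_to_mask_table is indexed by the bytes
# past the first full 64-byte copy.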
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_131
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_131
|
|
|
|
.L_16_blocks_overflow_131:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_131:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %xmm29,%xmm3,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_132
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_132
|
|
.L_small_initial_partial_block_132:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_132:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_132
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_132:
|
|
jmp .L_last_blocks_done_122
|
|
.L_last_num_blocks_is_6_122:
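# 6 residual blocks: a full zmm batch plus a ymm batch for blocks 5..6;
# only the second, ymm-wide copy is masked by %k1.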
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_133
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_133
|
|
|
|
.L_16_blocks_overflow_133:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_133:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %ymm29,%ymm3,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_134
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_134
|
|
.L_small_initial_partial_block_134:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_134:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_134
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_134:
|
|
jmp .L_last_blocks_done_122
|
|
.L_last_num_blocks_is_7_122:
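# 7 residual blocks: two zmm counter batches; the second 64-byte
# load/store is masked by %k1.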
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_135
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_135
|
|
|
|
.L_16_blocks_overflow_135:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_135:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_136
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_136
|
|
.L_small_initial_partial_block_136:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_136:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_136
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_136:
|
|
jmp .L_last_blocks_done_122
|
|
.L_last_num_blocks_is_8_122:
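# 8 residual blocks: two full zmm counter batches covering up to 128
# bytes, with %k1 guarding the tail of the second store.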
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_137
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_137
|
|
|
|
.L_16_blocks_overflow_137:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_137:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_138
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_138
|
|
.L_small_initial_partial_block_138:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_138:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_138
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_138:
|
|
jmp .L_last_blocks_done_122
|
|
.L_last_num_blocks_is_9_122:
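# 9 residual blocks: two zmm batches plus one xmm block; %rax is reduced
# by 128 before indexing byte64_len_to_mask_table for the third, masked
# copy.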
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_139
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_139
|
|
|
|
.L_16_blocks_overflow_139:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_139:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %xmm29,%xmm4,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_140
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_140
|
|
.L_small_initial_partial_block_140:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_140:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_140
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_140:
|
|
jmp .L_last_blocks_done_122
|
|
.L_last_num_blocks_is_10_122:
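# 10 residual blocks: two zmm batches plus a ymm batch for blocks 9..10,
# masked by %k1.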
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_141
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_141
|
|
|
|
.L_16_blocks_overflow_141:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_141:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %ymm29,%ymm4,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_142
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_142
|
|
.L_small_initial_partial_block_142:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_142:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_142
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_142:
|
|
jmp .L_last_blocks_done_122
|
|
.L_last_num_blocks_is_11_122:
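# Tail group of 11 blocks. These _last_num_blocks_is_N_122 handlers differ only
# in how many counter blocks are generated and which tail register is masked.
# %k1 is a byte mask from byte64_len_to_mask_table (indexed by the length past
# 128 bytes); the AES rounds on the fresh counter blocks are interleaved with
# GHASH accumulation over state buffered on the stack by the previous 16-block
# group. The key schedule at 0..160(%rdi) spans ten rounds, consistent with the
# AES-128 variant of this routine.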
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_143
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_143
|
|
|
|
.L_16_blocks_overflow_143:
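# Counter-overflow branch: the fall-through path above adds %zmm28/%zmm27 to
# the counter as-is, which appears to be safe only while the low counter byte
# cannot carry (hence the cmpl against 245 = 256 - 11). Here the blocks are
# byte-swapped with %zmm29, incremented via ddq_add_1234/ddq_add_4444 so the
# carry propagates, and swapped back.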
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_143:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_144
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_144
|
|
.L_small_initial_partial_block_144:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_144:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_144
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_144:
|
|
jmp .L_last_blocks_done_122
|
|
.L_last_num_blocks_is_12_122:
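# 12-block tail: same structure as the 11-block case; the third counter
# register is now completely used, so the tail extracts move to lane 3.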
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_145
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_145
|
|
|
|
.L_16_blocks_overflow_145:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_145:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_146
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_146
|
|
.L_small_initial_partial_block_146:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_146:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_146
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_146:
|
|
jmp .L_last_blocks_done_122
|
|
.L_last_num_blocks_is_13_122:
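# 13-block tail: a fourth counter register (%xmm5) carries block 13 and the
# byte mask now applies to the load/store at offset 192.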
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_147
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_147
|
|
|
|
.L_16_blocks_overflow_147:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_147:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %xmm29,%xmm5,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_148
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_148
|
|
.L_small_initial_partial_block_148:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_148:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_148
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_148:
|
|
jmp .L_last_blocks_done_122
|
|
.L_last_num_blocks_is_14_122:
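# 14-block tail: as above, with %ymm5 covering blocks 13-14.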
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_149
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_149
|
|
|
|
.L_16_blocks_overflow_149:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_149:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %ymm29,%ymm5,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_150
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_150
|
|
.L_small_initial_partial_block_150:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_150:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_150
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_150:
|
|
jmp .L_last_blocks_done_122
|
|
.L_last_num_blocks_is_15_122:
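# 15-block tail: %zmm5 covers blocks 13-15, masked at offset 192.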
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_151
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_151
|
|
|
|
.L_16_blocks_overflow_151:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_151:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_152
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_152
|
|
.L_small_initial_partial_block_152:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_152:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_152
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_152:
|
|
jmp .L_last_blocks_done_122
|
|
.L_last_num_blocks_is_16_122:
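# 16-block tail: all four counter registers are fully populated and the byte
# mask only trims the very last block.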
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_153
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_153
|
|
|
|
.L_16_blocks_overflow_153:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_153:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
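# Unlike the shorter cases there is no cmpq/jl here: with 15 complete blocks
# subtracted, %r8 describes the final block and control always continues into
# the bookkeeping path below.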
|
|
.L_small_initial_partial_block_154:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_154:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_154:
|
|
jmp .L_last_blocks_done_122
|
|
.L_last_num_blocks_is_0_122:
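# No blocks left to encrypt: this path only folds the stack-buffered state from
# the earlier blocks into the GHASH accumulator and reduces against POLY2.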
|
|
vmovdqa64 768(%rsp),%zmm13
|
|
vpxorq %zmm14,%zmm13,%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 832(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpxorq %zmm10,%zmm4,%zmm26
|
|
vpxorq %zmm6,%zmm0,%zmm24
|
|
vpxorq %zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 896(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 960(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_122:
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_10
|
|
|
|
.L_message_below_equal_16_blocks_10:
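# Whole message fits in at most 16 blocks: %r12 = ceil(%r8 / 16) and the
# compare/branch tree below dispatches to the matching
# _small_initial_num_blocks_is_N handler.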
|
|
|
|
|
|
movl %r8d,%r12d
|
|
addl $15,%r12d
|
|
shrl $4,%r12d
|
|
cmpq $8,%r12
|
|
je .L_small_initial_num_blocks_is_8_155
|
|
jl .L_small_initial_num_blocks_is_7_1_155
|
|
|
|
|
|
cmpq $12,%r12
|
|
je .L_small_initial_num_blocks_is_12_155
|
|
jl .L_small_initial_num_blocks_is_11_9_155
|
|
|
|
|
|
cmpq $16,%r12
|
|
je .L_small_initial_num_blocks_is_16_155
|
|
cmpq $15,%r12
|
|
je .L_small_initial_num_blocks_is_15_155
|
|
cmpq $14,%r12
|
|
je .L_small_initial_num_blocks_is_14_155
|
|
jmp .L_small_initial_num_blocks_is_13_155
|
|
|
|
.L_small_initial_num_blocks_is_11_9_155:
|
|
|
|
cmpq $11,%r12
|
|
je .L_small_initial_num_blocks_is_11_155
|
|
cmpq $10,%r12
|
|
je .L_small_initial_num_blocks_is_10_155
|
|
jmp .L_small_initial_num_blocks_is_9_155
|
|
|
|
.L_small_initial_num_blocks_is_7_1_155:
|
|
cmpq $4,%r12
|
|
je .L_small_initial_num_blocks_is_4_155
|
|
jl .L_small_initial_num_blocks_is_3_1_155
|
|
|
|
cmpq $7,%r12
|
|
je .L_small_initial_num_blocks_is_7_155
|
|
cmpq $6,%r12
|
|
je .L_small_initial_num_blocks_is_6_155
|
|
jmp .L_small_initial_num_blocks_is_5_155
|
|
|
|
.L_small_initial_num_blocks_is_3_1_155:
|
|
|
|
cmpq $3,%r12
|
|
je .L_small_initial_num_blocks_is_3_155
|
|
cmpq $2,%r12
|
|
je .L_small_initial_num_blocks_is_2_155
|
|
|
|
|
|
|
|
|
|
|
|
.L_small_initial_num_blocks_is_1_155:
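# Single block: build one counter block, encrypt it with the ten-round key
# schedule at 0..160(%rdi), XOR with the masked input at (%rcx,%r11), store the
# masked result at (%r9,%r11), then byte-reflect the output for the GHASH
# update that follows.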
|
|
vmovdqa64 SHUF_MASK(%rip),%xmm29
|
|
vpaddd ONE(%rip),%xmm2,%xmm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %xmm15,%xmm0,%xmm0
|
|
vpxorq %xmm6,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %xmm29,%xmm0,%xmm6
|
|
vextracti32x4 $0,%zmm6,%xmm13
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_156
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_156
|
|
.L_small_initial_partial_block_156:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
vpxorq %xmm13,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_156
|
|
.L_small_initial_compute_done_156:
|
|
.L_after_reduction_156:
|
|
jmp .L_small_initial_blocks_encrypted_155
|
|
.L_small_initial_num_blocks_is_2_155:
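# Two blocks: same flow at %ymm width; GHASH uses the key pair at 320(%rsi)
# when both blocks are complete, or just 336(%rsi) when the second is partial.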
|
|
vmovdqa64 SHUF_MASK(%rip),%ymm29
|
|
vshufi64x2 $0,%ymm2,%ymm2,%ymm0
|
|
vpaddd ddq_add_1234(%rip),%ymm0,%ymm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %ymm15,%ymm0,%ymm0
|
|
vpxorq %ymm6,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %ymm29,%ymm0,%ymm6
|
|
vextracti32x4 $1,%zmm6,%xmm13
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_157
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_157
|
|
.L_small_initial_partial_block_157:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_157:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_157
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_157:
|
|
jmp .L_small_initial_blocks_encrypted_155
|
|
.L_small_initial_num_blocks_is_3_155:
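# Exactly 3 blocks: counters {+1..+4} in %zmm0 (the +3 lane becomes the
# new %xmm2); up to 48 input bytes are loaded/stored under the %k1 byte
# mask; full-block GHASH uses H^3..H^1.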
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vextracti32x4 $2,%zmm6,%xmm13
|
|
subq $16 * (3 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_158
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_158
|
|
.L_small_initial_partial_block_158:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_158:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_158
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_158:
|
|
jmp .L_small_initial_blocks_encrypted_155
|
|
.L_small_initial_num_blocks_is_4_155:
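# Exactly 4 blocks: a full vector of counters in %zmm0; up to 64 bytes
# under the %k1 mask; full-block GHASH uses H^4..H^1 (288(%rsi)).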
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vextracti32x4 $3,%zmm6,%xmm13
|
|
subq $16 * (4 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_159
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_159
|
|
.L_small_initial_partial_block_159:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_159:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_159
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_159:
|
|
jmp .L_small_initial_blocks_encrypted_155
|
|
.L_small_initial_num_blocks_is_5_155:
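# Exactly 5 blocks: counters in %zmm0 plus one more in %xmm3
# (ddq_add_5678); %k1 is derived from %r8-64 and masks only the fifth
# block; full-block GHASH uses H^5..H^1.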
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %xmm15,%xmm3,%xmm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %xmm7,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %xmm29,%xmm3,%xmm7
|
|
vextracti32x4 $0,%zmm7,%xmm13
|
|
subq $16 * (5 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_160
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_160
|
|
.L_small_initial_partial_block_160:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_160:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_160
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_160:
|
|
jmp .L_small_initial_blocks_encrypted_155
|
|
.L_small_initial_num_blocks_is_6_155:
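# Exactly 6 blocks: %zmm0 plus two counters in %ymm3; the last two blocks
# are masked by %k1 (built from %r8-64); full-block GHASH uses H^6..H^1.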
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %ymm15,%ymm3,%ymm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %ymm7,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %ymm29,%ymm3,%ymm7
|
|
vextracti32x4 $1,%zmm7,%xmm13
|
|
subq $16 * (6 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_161
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_161
|
|
.L_small_initial_partial_block_161:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_161:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_161
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_161:
|
|
jmp .L_small_initial_blocks_encrypted_155
|
|
.L_small_initial_num_blocks_is_7_155:
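# Exactly 7 blocks: %zmm0 plus three counters in %zmm3; blocks 5-7 are
# masked by %k1 (built from %r8-64); full-block GHASH uses H^7..H^1.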
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vextracti32x4 $2,%zmm7,%xmm13
|
|
subq $16 * (7 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_162
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_162
|
|
.L_small_initial_partial_block_162:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_162:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_162
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_162:
|
|
jmp .L_small_initial_blocks_encrypted_155
|
|
.L_small_initial_num_blocks_is_8_155:
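# Exactly 8 blocks: two full counter vectors %zmm0/%zmm3; the second group
# of 64 bytes is masked by %k1; full-block GHASH uses H^8..H^1
# (224(%rsi) and 288(%rsi)).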
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vextracti32x4 $3,%zmm7,%xmm13
|
|
subq $16 * (8 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_163
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_163
|
|
.L_small_initial_partial_block_163:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_163:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_163
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_163:
|
|
jmp .L_small_initial_blocks_encrypted_155
|
|
.L_small_initial_num_blocks_is_9_155:
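# Exactly 9 blocks: %zmm0, %zmm3 and one more counter in %xmm4
# (ddq_add_8888); %k1 is derived from %r8-128 and masks the ninth block;
# full-block GHASH uses H^9..H^1.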
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %xmm15,%xmm4,%xmm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %xmm10,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %xmm29,%xmm4,%xmm10
|
|
vextracti32x4 $0,%zmm10,%xmm13
|
|
subq $16 * (9 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_164
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_164
|
|
.L_small_initial_partial_block_164:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_164:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_164
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_164:
|
|
jmp .L_small_initial_blocks_encrypted_155
|
|
.L_small_initial_num_blocks_is_10_155:
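# Exactly 10 blocks: %zmm0, %zmm3 and two counters in %ymm4; %k1 masks the
# bytes beyond the first 128; full-block GHASH uses H^10..H^1.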
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %ymm15,%ymm4,%ymm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %ymm10,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %ymm29,%ymm4,%ymm10
|
|
vextracti32x4 $1,%zmm10,%xmm13
|
|
subq $16 * (10 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_165
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_165
|
|
.L_small_initial_partial_block_165:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_165:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_165
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_165:
|
|
jmp .L_small_initial_blocks_encrypted_155
|
|
.L_small_initial_num_blocks_is_11_155:
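# Exactly 11 blocks: %zmm0, %zmm3 and three counters in %zmm4; %k1 masks
# the bytes beyond the first 128; full-block GHASH uses H^11..H^1.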
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %zmm29,%zmm4,%zmm10
|
|
vextracti32x4 $2,%zmm10,%xmm13
|
|
subq $16 * (11 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_166
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_166
|
|
.L_small_initial_partial_block_166:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_166:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_166
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_166:
|
|
jmp .L_small_initial_blocks_encrypted_155
|
|
.L_small_initial_num_blocks_is_12_155:
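# Exactly 12 blocks: three full counter vectors %zmm0/%zmm3/%zmm4;
# full-block GHASH uses H^12..H^1 (160/224/288(%rsi)).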
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %zmm29,%zmm4,%zmm10
|
|
vextracti32x4 $3,%zmm10,%xmm13
|
|
subq $16 * (12 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_167
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 160(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_167
|
|
.L_small_initial_partial_block_167:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_167:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_167
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_167:
|
|
jmp .L_small_initial_blocks_encrypted_155
|
|
.L_small_initial_num_blocks_is_13_155:
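# Exactly 13 blocks: %zmm0, %zmm3, %zmm4 plus one counter in %xmm5; %k1 is
# derived from %r8-192 and masks only the last block; full-block GHASH
# uses H^13..H^1.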
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %xmm15,%xmm5,%xmm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %xmm11,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %zmm29,%zmm4,%zmm10
|
|
vpshufb %xmm29,%xmm5,%xmm11
|
|
vextracti32x4 $0,%zmm11,%xmm13
|
|
subq $16 * (13 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_168
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 144(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
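# Two-step reduction of the folded 256-bit product modulo the GHASH polynomial
# (constant in POLY2); the reduced 128-bit hash state ends up in %xmm14.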
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_168
|
|
.L_small_initial_partial_block_168:
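# Partial final block: store the carried byte count at (%rdx), stash the
# in-progress ciphertext block (%xmm12) at 16(%rsi) so a later call can finish
# it, and GHASH only the 12 complete blocks (key powers start one slot later,
# at 160(%rsi)).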
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 160(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_168:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_168
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_168:
|
|
jmp .L_small_initial_blocks_encrypted_155
|
|
.L_small_initial_num_blocks_is_14_155:
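# Same pattern as the 13-block case, but with 14 counter blocks: the fourth
# vector is used as %ymm5 (two blocks) and the masked tail covers block 14.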
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %ymm15,%ymm5,%ymm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %ymm11,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %zmm29,%zmm4,%zmm10
|
|
vpshufb %ymm29,%ymm5,%ymm11
|
|
vextracti32x4 $1,%zmm11,%xmm13
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_169
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 128(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_169
|
|
.L_small_initial_partial_block_169:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 144(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_169:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_169
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_169:
|
|
jmp .L_small_initial_blocks_encrypted_155
|
|
.L_small_initial_num_blocks_is_15_155:
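# 15-block variant: the fourth counter vector is a full %zmm5 with its load and
# store masked down to three blocks; block 15 is extracted with lane index 2.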
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %zmm15,%zmm5,%zmm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %zmm11,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %zmm29,%zmm4,%zmm10
|
|
vpshufb %zmm29,%zmm5,%zmm11
|
|
vextracti32x4 $2,%zmm11,%xmm13
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_170
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 112(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_170
|
|
.L_small_initial_partial_block_170:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 128(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_170:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_170
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_170:
|
|
jmp .L_small_initial_blocks_encrypted_155
|
|
.L_small_initial_num_blocks_is_16_155:
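# Full 16-block tail: no complete/partial branch is needed; after subtracting
# the 15 leading blocks the code falls through to the partial-block bookkeeping
# below for block 16.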
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %zmm15,%zmm5,%zmm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %zmm11,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %zmm29,%zmm4,%zmm10
|
|
vpshufb %zmm29,%zmm5,%zmm11
|
|
vextracti32x4 $3,%zmm11,%xmm13
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_171:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 112(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_171:
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_171:
|
|
.L_small_initial_blocks_encrypted_155:
|
|
.L_ghash_done_10:
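# Persist the updated state: the last counter block goes back to 0(%rsi) and
# the GHASH accumulator to 64(%rsi).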
|
|
vmovdqu64 %xmm2,0(%rsi)
|
|
vmovdqu64 %xmm14,64(%rsi)
|
|
.L_enc_dec_done_10:
|
|
jmp .Lexit_gcm_encrypt
|
|
.align 32
|
|
.Laes_gcm_encrypt_192_avx512:
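# AES-192 GCM encrypt path (12 rounds, key schedule at 0..192(%rdi)).  As used
# below: %rsi = GCM context (counter at 0, GHASH state at 64, hash-key powers
# from 96 up to H^1 at 336), (%rdx) = byte count of a pending partial block,
# %rcx = source, %r9 = destination, %r8 = remaining length.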
|
|
orq %r8,%r8
|
|
je .L_enc_dec_done_172
|
|
xorq %r14,%r14
|
|
vmovdqu64 64(%rsi),%xmm14
|
|
|
|
movq (%rdx),%r11
|
|
orq %r11,%r11
|
|
je .L_partial_block_done_173
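# A previous call left 1..15 buffered bytes: merge new input into that block
# using the SHIFT_MASK shuffles, update GHASH if the block is now complete, and
# write out the newly produced ciphertext bytes under mask %k1.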
|
|
movl $16,%r10d
|
|
leaq byte_len_to_mask_table(%rip),%r12
|
|
cmpq %r10,%r8
|
|
cmovcq %r8,%r10
|
|
kmovw (%r12,%r10,2),%k1
|
|
vmovdqu8 (%rcx),%xmm0{%k1}{z}
|
|
|
|
vmovdqu64 16(%rsi),%xmm3
|
|
vmovdqu64 336(%rsi),%xmm4
|
|
|
|
|
|
|
|
leaq SHIFT_MASK(%rip),%r12
|
|
addq %r11,%r12
|
|
vmovdqu64 (%r12),%xmm5
|
|
vpshufb %xmm5,%xmm3,%xmm3
|
|
vpxorq %xmm0,%xmm3,%xmm3
|
|
|
|
|
|
leaq (%r8,%r11,1),%r13
|
|
subq $16,%r13
|
|
jge .L_no_extra_mask_173
|
|
subq %r13,%r12
|
|
.L_no_extra_mask_173:
|
|
|
|
|
|
|
|
vmovdqu64 16(%r12),%xmm0
|
|
vpand %xmm0,%xmm3,%xmm3
|
|
vpshufb SHUF_MASK(%rip),%xmm3,%xmm3
|
|
vpshufb %xmm5,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm14,%xmm14
|
|
cmpq $0,%r13
|
|
jl .L_partial_incomplete_173
|
|
|
|
vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7
|
|
vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10
|
|
vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11
|
|
vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14
|
|
vpxorq %xmm11,%xmm14,%xmm14
|
|
|
|
vpsrldq $8,%xmm14,%xmm11
|
|
vpslldq $8,%xmm14,%xmm14
|
|
vpxorq %xmm11,%xmm7,%xmm7
|
|
vpxorq %xmm10,%xmm14,%xmm14
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%xmm11
|
|
|
|
vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10
|
|
vpslldq $8,%xmm10,%xmm10
|
|
vpxorq %xmm10,%xmm14,%xmm14
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10
|
|
vpsrldq $4,%xmm10,%xmm10
|
|
vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
|
|
vpternlogq $0x96,%xmm10,%xmm7,%xmm14
|
|
|
|
movq $0,(%rdx)
|
|
|
|
movq %r11,%r12
|
|
movq $16,%r11
|
|
subq %r12,%r11
|
|
jmp .L_enc_dec_done_173
|
|
|
|
.L_partial_incomplete_173:
|
|
addq %r8,(%rdx)
|
|
movq %r8,%r11
|
|
|
|
.L_enc_dec_done_173:
|
|
|
|
|
|
leaq byte_len_to_mask_table(%rip),%r12
|
|
kmovw (%r12,%r11,2),%k1
|
|
vmovdqu64 %xmm14,64(%rsi)
|
|
|
|
vpshufb SHUF_MASK(%rip),%xmm3,%xmm3
|
|
vpshufb %xmm5,%xmm3,%xmm3
|
|
movq %r9,%r12
|
|
vmovdqu8 %xmm3,(%r12){%k1}
|
|
.L_partial_block_done_173:
|
|
vmovdqu64 0(%rsi),%xmm2
|
|
subq %r11,%r8
|
|
je .L_enc_dec_done_172
|
|
cmpq $256,%r8
|
|
jbe .L_message_below_equal_16_blocks_172
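# More than 256 bytes: take the wide path.  The counter block is broadcast and
# byte-swapped into %zmm2, and %r15d tracks its low byte so the 16-block
# increments can detect an 8-bit carry (see the *_overflow branches).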
|
|
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vmovdqa64 ddq_addbe_4444(%rip),%zmm27
|
|
vmovdqa64 ddq_addbe_1234(%rip),%zmm28
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vmovd %xmm2,%r15d
|
|
andl $255,%r15d
|
|
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
|
|
|
|
|
|
cmpb $240,%r15b
|
|
jae .L_next_16_overflow_174
|
|
vpaddd %zmm28,%zmm2,%zmm7
|
|
vpaddd %zmm27,%zmm7,%zmm10
|
|
vpaddd %zmm27,%zmm10,%zmm11
|
|
vpaddd %zmm27,%zmm11,%zmm12
|
|
jmp .L_next_16_ok_174
|
|
.L_next_16_overflow_174:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm12
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
|
|
vpaddd %zmm12,%zmm7,%zmm10
|
|
vpaddd %zmm12,%zmm10,%zmm11
|
|
vpaddd %zmm12,%zmm11,%zmm12
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vpshufb %zmm29,%zmm12,%zmm12
|
|
.L_next_16_ok_174:
|
|
vshufi64x2 $255,%zmm12,%zmm12,%zmm2
|
|
addb $16,%r15b
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm0
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm3
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm4
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm5
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm6
|
|
vpxorq %zmm6,%zmm7,%zmm7
|
|
vpxorq %zmm6,%zmm10,%zmm10
|
|
vpxorq %zmm6,%zmm11,%zmm11
|
|
vpxorq %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 16(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 32(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 48(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 64(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 80(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 96(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 112(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 128(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 144(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 160(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 176(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 192(%rdi),%zmm6
|
|
vaesenclast %zmm6,%zmm7,%zmm7
|
|
vaesenclast %zmm6,%zmm10,%zmm10
|
|
vaesenclast %zmm6,%zmm11,%zmm11
|
|
vaesenclast %zmm6,%zmm12,%zmm12
|
|
|
|
|
|
vpxorq %zmm0,%zmm7,%zmm7
|
|
vpxorq %zmm3,%zmm10,%zmm10
|
|
vpxorq %zmm4,%zmm11,%zmm11
|
|
vpxorq %zmm5,%zmm12,%zmm12
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm7,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm10,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm11,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm12,192(%r10,%r11,1)
|
|
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vpshufb %zmm29,%zmm12,%zmm12
|
|
vmovdqa64 %zmm7,768(%rsp)
|
|
vmovdqa64 %zmm10,832(%rsp)
|
|
vmovdqa64 %zmm11,896(%rsp)
|
|
vmovdqa64 %zmm12,960(%rsp)
|
|
testq %r14,%r14
|
|
jnz .L_skip_hkeys_precomputation_175
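# First time through: copy four vectors of precomputed hash-key powers from the
# context (96..288(%rsi)) into the stack frame at 512..704(%rsp) for the main
# loop's GHASH.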
|
|
|
|
vmovdqu64 288(%rsi),%zmm0
|
|
vmovdqu64 %zmm0,704(%rsp)
|
|
|
|
vmovdqu64 224(%rsi),%zmm3
|
|
vmovdqu64 %zmm3,640(%rsp)
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
|
|
|
|
vmovdqu64 160(%rsi),%zmm4
|
|
vmovdqu64 %zmm4,576(%rsp)
|
|
|
|
vmovdqu64 96(%rsi),%zmm5
|
|
vmovdqu64 %zmm5,512(%rsp)
|
|
.L_skip_hkeys_precomputation_175:
|
|
cmpq $512,%r8
|
|
jb .L_message_below_32_blocks_172
|
|
|
|
|
|
|
|
cmpb $240,%r15b
|
|
jae .L_next_16_overflow_176
|
|
vpaddd %zmm28,%zmm2,%zmm7
|
|
vpaddd %zmm27,%zmm7,%zmm10
|
|
vpaddd %zmm27,%zmm10,%zmm11
|
|
vpaddd %zmm27,%zmm11,%zmm12
|
|
jmp .L_next_16_ok_176
|
|
.L_next_16_overflow_176:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm12
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
|
|
vpaddd %zmm12,%zmm7,%zmm10
|
|
vpaddd %zmm12,%zmm10,%zmm11
|
|
vpaddd %zmm12,%zmm11,%zmm12
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vpshufb %zmm29,%zmm12,%zmm12
|
|
.L_next_16_ok_176:
|
|
vshufi64x2 $255,%zmm12,%zmm12,%zmm2
|
|
addb $16,%r15b
|
|
|
|
vmovdqu8 256(%rcx,%r11,1),%zmm0
|
|
vmovdqu8 320(%rcx,%r11,1),%zmm3
|
|
vmovdqu8 384(%rcx,%r11,1),%zmm4
|
|
vmovdqu8 448(%rcx,%r11,1),%zmm5
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm6
|
|
vpxorq %zmm6,%zmm7,%zmm7
|
|
vpxorq %zmm6,%zmm10,%zmm10
|
|
vpxorq %zmm6,%zmm11,%zmm11
|
|
vpxorq %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 16(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 32(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 48(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 64(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 80(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 96(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 112(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 128(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 144(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 160(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 176(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 192(%rdi),%zmm6
|
|
vaesenclast %zmm6,%zmm7,%zmm7
|
|
vaesenclast %zmm6,%zmm10,%zmm10
|
|
vaesenclast %zmm6,%zmm11,%zmm11
|
|
vaesenclast %zmm6,%zmm12,%zmm12
|
|
|
|
|
|
vpxorq %zmm0,%zmm7,%zmm7
|
|
vpxorq %zmm3,%zmm10,%zmm10
|
|
vpxorq %zmm4,%zmm11,%zmm11
|
|
vpxorq %zmm5,%zmm12,%zmm12
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm7,256(%r10,%r11,1)
|
|
vmovdqu8 %zmm10,320(%r10,%r11,1)
|
|
vmovdqu8 %zmm11,384(%r10,%r11,1)
|
|
vmovdqu8 %zmm12,448(%r10,%r11,1)
|
|
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vpshufb %zmm29,%zmm12,%zmm12
|
|
vmovdqa64 %zmm7,1024(%rsp)
|
|
vmovdqa64 %zmm10,1088(%rsp)
|
|
vmovdqa64 %zmm11,1152(%rsp)
|
|
vmovdqa64 %zmm12,1216(%rsp)
|
|
testq %r14,%r14
|
|
jnz .L_skip_hkeys_precomputation_177
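# Extend the on-stack table of hash-key powers: each block below is one full
# GF(2^128) multiply (four vpclmulqdq partial products, carry-less fold, then
# the POLY2 reduction) of a previously stored power by the single power
# broadcast in %zmm3, filling 448(%rsp) down to 0(%rsp).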
|
|
vmovdqu64 640(%rsp),%zmm3
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
|
|
|
|
vmovdqu64 576(%rsp),%zmm4
|
|
vmovdqu64 512(%rsp),%zmm5
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,448(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,384(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,320(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,256(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,192(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,128(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,64(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,0(%rsp)
|
|
.L_skip_hkeys_precomputation_177:
|
|
movq $1,%r14
|
|
addq $512,%r11
|
|
subq $512,%r8
|
|
|
|
cmpq $768,%r8
|
|
jb .L_no_more_big_nblocks_172
|
|
.L_encrypt_big_nblocks_172:
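# Main loop: each iteration encrypts 48 blocks (three groups of 16) and, in the
# same pass, multiplies the previous iteration's buffered ciphertext by the
# stacked hash-key powers, so the AES rounds and the GHASH vpclmulqdq work
# overlap.  768 bytes are consumed per iteration (%r11 += 768, %r8 -= 768).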
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_178
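# If the low counter byte is >= 240, adding 16 would carry out of that byte, so
# the overflow path increments in little-endian form and re-applies SHUF_MASK.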
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_178
|
|
.L_16_blocks_overflow_178:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_178:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vpxorq %zmm12,%zmm6,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqa64 %zmm0,1280(%rsp)
|
|
vmovdqa64 %zmm3,1344(%rsp)
|
|
vmovdqa64 %zmm4,1408(%rsp)
|
|
vmovdqa64 %zmm5,1472(%rsp)
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_179
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_179
|
|
.L_16_blocks_overflow_179:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_179:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 256(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 320(%rsp),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 384(%rsp),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 448(%rsp),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 256(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 320(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 384(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 448(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vpternlogq $0x96,%zmm12,%zmm6,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,256(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,320(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,384(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,448(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqa64 %zmm0,768(%rsp)
|
|
vmovdqa64 %zmm3,832(%rsp)
|
|
vmovdqa64 %zmm4,896(%rsp)
|
|
vmovdqa64 %zmm5,960(%rsp)
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_180
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_180
|
|
.L_16_blocks_overflow_180:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_180:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 512(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 576(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 640(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 704(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
|
|
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
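# The final GHASH reduction for this 48-block window is interleaved with the
# remaining AES rounds: fold the accumulators, reduce modulo the GCM polynomial
# with POLY2, and copy the new hash state to %zmm14 after the stores.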
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpternlogq $0x96,%zmm15,%zmm12,%zmm6
|
|
vpxorq %zmm24,%zmm6,%zmm6
|
|
vpternlogq $0x96,%zmm10,%zmm13,%zmm7
|
|
vpxorq %zmm25,%zmm7,%zmm7
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vextracti64x4 $1,%zmm6,%ymm12
|
|
vpxorq %ymm12,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm12
|
|
vpxorq %xmm12,%xmm6,%xmm6
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm6
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,512(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,576(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,640(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,704(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqa64 %zmm0,1024(%rsp)
|
|
vmovdqa64 %zmm3,1088(%rsp)
|
|
vmovdqa64 %zmm4,1152(%rsp)
|
|
vmovdqa64 %zmm5,1216(%rsp)
|
|
vmovdqa64 %zmm6,%zmm14
|
|
|
|
addq $768,%r11
|
|
subq $768,%r8
|
|
cmpq $768,%r8
|
|
jae .L_encrypt_big_nblocks_172
|
|
|
|
.L_no_more_big_nblocks_172:
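# Fewer than 48 blocks remain: pick the path that still has 32 or 16 buffered
# ciphertext blocks to GHASH before handling the 0..16-block tail.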
|
|
|
|
cmpq $512,%r8
|
|
jae .L_encrypt_32_blocks_172
|
|
|
|
cmpq $256,%r8
|
|
jae .L_encrypt_16_blocks_172
|
|
.L_encrypt_0_blocks_ghash_32_172:
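# No full 16-block group is left to encrypt, but buffered ciphertext from
# earlier iterations still needs hashing: %rbx selects stack hash-key powers so
# they line up with the final blocks, the saved vectors at 768..960(%rsp) are
# multiplied in here, and the tail handler folds in the remaining group.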
|
|
movl %r8d,%r10d
|
|
andl $~15,%r10d
|
|
movl $256,%ebx
|
|
subl %r10d,%ebx
|
|
vmovdqa64 768(%rsp),%zmm13
|
|
vpxorq %zmm14,%zmm13,%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 832(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpxorq %zmm10,%zmm4,%zmm26
|
|
vpxorq %zmm6,%zmm0,%zmm24
|
|
vpxorq %zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 896(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 960(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
addl $256,%ebx
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
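# %r10d = number of 16-byte blocks left (length rounded up), 0..16; the
# compare/branch ladder below dispatches to a dedicated tail routine per count.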
|
|
je .L_last_num_blocks_is_0_181
|
|
|
|
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_181
|
|
jb .L_last_num_blocks_is_7_1_181
|
|
|
|
|
|
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_181
|
|
jb .L_last_num_blocks_is_11_9_181
|
|
|
|
|
|
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_181
|
|
ja .L_last_num_blocks_is_16_181
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_181
|
|
jmp .L_last_num_blocks_is_13_181
|
|
|
|
.L_last_num_blocks_is_11_9_181:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_181
|
|
ja .L_last_num_blocks_is_11_181
|
|
jmp .L_last_num_blocks_is_9_181
|
|
|
|
.L_last_num_blocks_is_7_1_181:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_181
|
|
jb .L_last_num_blocks_is_3_1_181
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_181
|
|
je .L_last_num_blocks_is_6_181
|
|
jmp .L_last_num_blocks_is_5_181
|
|
|
|
.L_last_num_blocks_is_3_1_181:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_181
|
|
je .L_last_num_blocks_is_2_181
|
|
.L_last_num_blocks_is_1_181:
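# Single remaining (possibly partial) block: one counter block in %xmm0 runs
# the 12 AES-192 rounds while the deferred GHASH multiplies finish; the load
# and store are masked by %k1 from byte64_len_to_mask_table.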
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_182
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_182
|
|
|
|
.L_16_blocks_overflow_182:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_182:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
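# Keep only the valid tail bytes (zero the rest) so the byte-reflect and
# GHASH below never see stale data past the end of the ciphertext.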
|
|
vpshufb %xmm29,%xmm0,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_183
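# Fall-through: the final block is a full 16 bytes.  Clear the stored
# partial-block length at (%rdx), multiply the block by what should be the
# last hash-key power H^1 at 336(%rsi), fold in the carried partial
# products zmm24/zmm25/zmm26, and reduce into the tag state %xmm14.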
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
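# Fold the four 128-bit lanes of the high (zmm0) and low (zmm3) halves of
# the GHASH product down to %xmm0/%xmm3, then do the two-step POLY2
# reduction into %xmm14.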
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_183
|
|
.L_small_initial_partial_block_183:
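# The final block is partial: record its byte count at (%rdx) and save the
# raw ciphertext block (%xmm11) at 16(%rsi), which appears to be the
# context's partial-block buffer.  Only the previously accumulated GHASH
# state is reduced here; the reflected partial block (%xmm7) is XORed into
# %xmm14 afterwards so it can be completed on a later call.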
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm0
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
|
|
vpslldq $8,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm3
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
|
|
vpsrldq $4,%xmm4,%xmm4
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm4,%xmm14
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_183
|
|
.L_small_initial_compute_done_183:
|
|
.L_after_reduction_183:
|
|
jmp .L_last_blocks_done_181
|
|
.L_last_num_blocks_is_2_181:
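# The 2..16 block tails below follow the same template as the 1-block
# case: build n counter blocks (overflow threshold 256-n, here 254),
# AES-CTR them with the last register masked by %k1, GHASH the buffered
# stack blocks in parallel, and finally hash the fresh ciphertext against
# hash-key powers read from the context at progressively lower offsets.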
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_184
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_184
|
|
|
|
.L_16_blocks_overflow_184:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_184:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %ymm29,%ymm0,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_185
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_185
|
|
.L_small_initial_partial_block_185:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_185:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_185
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_185:
|
|
jmp .L_last_blocks_done_181
|
|
.L_last_num_blocks_is_3_181:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_186
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_186
|
|
|
|
.L_16_blocks_overflow_186:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_186:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_187
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_187
|
|
.L_small_initial_partial_block_187:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_187:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_187
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_187:
|
|
jmp .L_last_blocks_done_181
|
|
.L_last_num_blocks_is_4_181:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_188
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_188
|
|
|
|
.L_16_blocks_overflow_188:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_188:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_189
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_189
|
|
.L_small_initial_partial_block_189:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_189:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_189
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_189:
|
|
jmp .L_last_blocks_done_181
|
|
.L_last_num_blocks_is_5_181:
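# From 5 blocks up a second counter register covers blocks past the first
# four: the first 64 bytes are loaded and stored unmasked, and only the
# trailing register uses %k1 (hence the subq $64 before the mask lookup).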
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_190
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_190
|
|
|
|
.L_16_blocks_overflow_190:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_190:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %xmm29,%xmm3,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_191
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_191
|
|
.L_small_initial_partial_block_191:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_191:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_191
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_191:
|
|
jmp .L_last_blocks_done_181
|
|
.L_last_num_blocks_is_6_181:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_192
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_192
|
|
|
|
.L_16_blocks_overflow_192:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_192:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %ymm29,%ymm3,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_193
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_193
|
|
.L_small_initial_partial_block_193:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_193:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_193
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_193:
|
|
jmp .L_last_blocks_done_181
|
|
.L_last_num_blocks_is_7_181:
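# 7- and 8-block tails use two full ZMM counter registers; only the second
# register's load/store is masked by %k1.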
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_194
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_194
|
|
|
|
.L_16_blocks_overflow_194:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_194:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_195
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_195
|
|
.L_small_initial_partial_block_195:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_195:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_195
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_195:
|
|
jmp .L_last_blocks_done_181
|
|
.L_last_num_blocks_is_8_181:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_196
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_196
|
|
|
|
.L_16_blocks_overflow_196:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_196:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_197
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_197
|
|
.L_small_initial_partial_block_197:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_197:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_197
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_197:
|
|
jmp .L_last_blocks_done_181
|
|
.L_last_num_blocks_is_9_181:
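# Nine or more blocks bring in a third counter/data register: the first
# 128 bytes are handled unmasked and the tail mask is computed from
# %r8 - 128.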
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_198
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_198
|
|
|
|
.L_16_blocks_overflow_198:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_198:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %xmm29,%xmm4,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_199
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_199
|
|
.L_small_initial_partial_block_199:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_199:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_199
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_199:
|
|
jmp .L_last_blocks_done_181
|
|
.L_last_num_blocks_is_10_181:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_200
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_200
|
|
|
|
.L_16_blocks_overflow_200:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_200:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
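# Zero the bytes beyond the message end in the last output vector so the
# byte-reflection and GHASH below only see valid ciphertext.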
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %ymm29,%ymm4,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
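# %r8 now holds the length of the final block: a full 16-byte block takes the
# path just below, anything shorter branches to the partial-block bookkeeping.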
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_201
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_201
|
|
.L_small_initial_partial_block_201:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_201:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_201
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_201:
|
|
jmp .L_last_blocks_done_181
|
|
.L_last_num_blocks_is_11_181:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_202
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_202
|
|
|
|
.L_16_blocks_overflow_202:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_202:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_203
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_203
|
|
.L_small_initial_partial_block_203:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_203:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_203
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_203:
|
|
jmp .L_last_blocks_done_181
|
|
.L_last_num_blocks_is_12_181:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_204
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_204
|
|
|
|
.L_16_blocks_overflow_204:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_204:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_205
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_205
|
|
.L_small_initial_partial_block_205:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_205:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_205
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_205:
|
|
jmp .L_last_blocks_done_181
|
|
.L_last_num_blocks_is_13_181:
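# From 13 blocks upward a fourth counter vector is needed: three full 64-byte
# chunks are handled unmasked and the mask from byte64_len_to_mask_table
# (length minus 192) covers only the fourth, possibly partial, chunk.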
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_206
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_206
|
|
|
|
.L_16_blocks_overflow_206:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_206:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %xmm29,%xmm5,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_207
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_207
|
|
.L_small_initial_partial_block_207:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_207:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_207
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_207:
|
|
jmp .L_last_blocks_done_181
|
|
.L_last_num_blocks_is_14_181:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_208
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_208
|
|
|
|
.L_16_blocks_overflow_208:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_208:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %ymm29,%ymm5,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_209
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_209
|
|
.L_small_initial_partial_block_209:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_209:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_209
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_209:
|
|
jmp .L_last_blocks_done_181
|
|
.L_last_num_blocks_is_15_181:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_210
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_210
|
|
|
|
.L_16_blocks_overflow_210:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_210:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_211
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_211
|
|
.L_small_initial_partial_block_211:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_211:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_211
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_211:
|
|
jmp .L_last_blocks_done_181
|
|
.L_last_num_blocks_is_16_181:
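# Largest tail (16 blocks): all four counter vectors are full 512-bit
# registers and only the last 64-byte chunk is masked.  Note this case drops
# straight into the partial-block bookkeeping below; no separate
# full-final-block shortcut is generated here.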
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_212
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_212
|
|
|
|
.L_16_blocks_overflow_212:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_212:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_213:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_213:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_213:
|
|
jmp .L_last_blocks_done_181
|
|
.L_last_num_blocks_is_0_181:
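# Nothing left to encrypt: finish the GHASH of the 16 ciphertext blocks
# already stashed on the stack, fold the vector lanes, and reduce with POLY2
# into %xmm14.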
|
|
vmovdqa64 1024(%rsp),%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1088(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1152(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1216(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
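# Fold the 256-bit GHASH product (high half in %xmm24, low half in %xmm25) back
# to 128 bits modulo the GCM polynomial using the POLY2 constant.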
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_181:
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_172
|
|
.L_encrypt_32_blocks_172:
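# "Encrypt 32 blocks" path: two batches of 16 counter blocks, with the AES rounds
# interleaved against GHASH of previously produced ciphertext kept on the stack.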
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_214
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_214
|
|
.L_16_blocks_overflow_214:
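# The low counter byte would wrap: redo the increments on byte-swapped
# (big-endian) counters so the carry propagates, then swap back.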
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_214:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp),%zmm1
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vpxorq %zmm12,%zmm6,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqa64 %zmm0,1280(%rsp)
|
|
vmovdqa64 %zmm3,1344(%rsp)
|
|
vmovdqa64 %zmm4,1408(%rsp)
|
|
vmovdqa64 %zmm5,1472(%rsp)
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_215
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_215
|
|
.L_16_blocks_overflow_215:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_215:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 256(%rsp),%zmm1
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 320(%rsp),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 384(%rsp),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 448(%rsp),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 256(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 320(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 384(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 448(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vpternlogq $0x96,%zmm12,%zmm6,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,256(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,320(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,384(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,448(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqa64 %zmm0,768(%rsp)
|
|
vmovdqa64 %zmm3,832(%rsp)
|
|
vmovdqa64 %zmm4,896(%rsp)
|
|
vmovdqa64 %zmm5,960(%rsp)
|
|
vmovdqa64 1280(%rsp),%zmm13
|
|
vmovdqu64 512(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1344(%rsp),%zmm13
|
|
vmovdqu64 576(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1408(%rsp),%zmm13
|
|
vmovdqu64 640(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1472(%rsp),%zmm13
|
|
vmovdqu64 704(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
subq $512,%r8
|
|
addq $512,%r11
|
|
movl %r8d,%r10d
|
|
andl $~15,%r10d
|
|
movl $512,%ebx
|
|
subl %r10d,%ebx
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
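# %r10d = number of 16-byte blocks still to process (rounded up); %rbx is used
# below to index the stacked GHASH key powers for the tail. Dispatch to the
# matching 0..16-block handler.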
|
|
je .L_last_num_blocks_is_0_216
|
|
|
|
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_216
|
|
jb .L_last_num_blocks_is_7_1_216
|
|
|
|
|
|
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_216
|
|
jb .L_last_num_blocks_is_11_9_216
|
|
|
|
|
|
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_216
|
|
ja .L_last_num_blocks_is_16_216
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_216
|
|
jmp .L_last_num_blocks_is_13_216
|
|
|
|
.L_last_num_blocks_is_11_9_216:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_216
|
|
ja .L_last_num_blocks_is_11_216
|
|
jmp .L_last_num_blocks_is_9_216
|
|
|
|
.L_last_num_blocks_is_7_1_216:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_216
|
|
jb .L_last_num_blocks_is_3_1_216
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_216
|
|
je .L_last_num_blocks_is_6_216
|
|
jmp .L_last_num_blocks_is_5_216
|
|
|
|
.L_last_num_blocks_is_3_1_216:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_216
|
|
je .L_last_num_blocks_is_2_216
|
|
.L_last_num_blocks_is_1_216:
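# Tail of a single block: byte64_len_to_mask_table yields a byte mask for the
# partial block, used for the masked loads/stores under {%k1}.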
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_217
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_217
|
|
|
|
.L_16_blocks_overflow_217:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_217:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
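# Zero the bytes beyond the message so they do not contribute to the GHASH input.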
|
|
vpshufb %xmm29,%xmm0,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_218
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 336(%rsi),%xmm1
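# A single full block is hashed with H^1, stored at 336(%rsi).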
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_218
|
|
.L_small_initial_partial_block_218:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm0
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
|
|
vpslldq $8,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm3
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
|
|
vpsrldq $4,%xmm4,%xmm4
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm4,%xmm14
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_218
|
|
.L_small_initial_compute_done_218:
|
|
.L_after_reduction_218:
|
|
jmp .L_last_blocks_done_216
|
|
.L_last_num_blocks_is_2_216:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_219
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_219
|
|
|
|
.L_16_blocks_overflow_219:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_219:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %ymm29,%ymm0,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_220
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_220
|
|
.L_small_initial_partial_block_220:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_220:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_220
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_220:
|
|
jmp .L_last_blocks_done_216
|
|
.L_last_num_blocks_is_3_216:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_221
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_221
|
|
|
|
.L_16_blocks_overflow_221:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_221:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_222
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_222
|
|
.L_small_initial_partial_block_222:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_222:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_222
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_222:
|
|
jmp .L_last_blocks_done_216
|
|
.L_last_num_blocks_is_4_216:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_223
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_223
|
|
|
|
.L_16_blocks_overflow_223:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_223:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_224
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_224
|
|
.L_small_initial_partial_block_224:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_224:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_224
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_224:
|
|
jmp .L_last_blocks_done_216
|
|
.L_last_num_blocks_is_5_216:
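# Tails of 5..8 blocks span two vector registers: the first 64 bytes are full,
# so the store mask is derived from the remaining length minus 64.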
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_225
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_225
|
|
|
|
.L_16_blocks_overflow_225:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_225:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %xmm29,%xmm3,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_226
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_226
|
|
.L_small_initial_partial_block_226:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_226:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_226
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_226:
|
|
jmp .L_last_blocks_done_216
|
|
.L_last_num_blocks_is_6_216:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_227
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_227
|
|
|
|
.L_16_blocks_overflow_227:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_227:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %ymm29,%ymm3,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_228
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_228
|
|
.L_small_initial_partial_block_228:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_228:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_228
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_228:
|
|
jmp .L_last_blocks_done_216
|
|
.L_last_num_blocks_is_7_216:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_229
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_229
|
|
|
|
.L_16_blocks_overflow_229:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_229:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_230
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_230
|
|
.L_small_initial_partial_block_230:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_230:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_230
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_230:
|
|
jmp .L_last_blocks_done_216
|
|
.L_last_num_blocks_is_8_216:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_231
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_231
|
|
|
|
.L_16_blocks_overflow_231:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_231:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_232
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_232
|
|
.L_small_initial_partial_block_232:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_232:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_232
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_232:
|
|
jmp .L_last_blocks_done_216
|
|
.L_last_num_blocks_is_9_216:
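# Tail of 9 blocks: two full ZMM groups plus one extra block in xmm4; the final (possibly partial) block is masked with k1.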
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_233
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_233
|
|
|
|
.L_16_blocks_overflow_233:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_233:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %xmm29,%xmm4,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_234
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_234
|
|
.L_small_initial_partial_block_234:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_234:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_234
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_234:
|
|
jmp .L_last_blocks_done_216
|
|
.L_last_num_blocks_is_10_216:
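# Tail of 10 blocks: two full ZMM groups plus two blocks in ymm4, with k1 masking the trailing bytes.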
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_235
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_235
|
|
|
|
.L_16_blocks_overflow_235:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_235:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %ymm29,%ymm4,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_236
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_236
|
|
.L_small_initial_partial_block_236:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_236:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_236
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_236:
|
|
jmp .L_last_blocks_done_216
|
|
.L_last_num_blocks_is_11_216:
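# Tail of 11 blocks: two full ZMM groups plus three blocks in zmm4, loaded and stored under mask k1.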
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_237
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_237
|
|
|
|
.L_16_blocks_overflow_237:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_237:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_238
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_238
|
|
.L_small_initial_partial_block_238:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_238:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_238
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_238:
|
|
jmp .L_last_blocks_done_216
|
|
.L_last_num_blocks_is_12_216:
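# Tail of 12 blocks: three ZMM counter groups; the third 64-byte group is masked with k1 for a possible partial final block.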
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_239
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_239
|
|
|
|
.L_16_blocks_overflow_239:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_239:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_240
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_240
|
|
.L_small_initial_partial_block_240:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_240:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_240
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_240:
|
|
jmp .L_last_blocks_done_216
|
|
.L_last_num_blocks_is_13_216:
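# Tail of 13 blocks: three full ZMM groups plus one extra block in xmm5, masked with k1.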
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_241
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_241
|
|
|
|
.L_16_blocks_overflow_241:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_241:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %xmm29,%xmm5,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_242
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_242
|
|
.L_small_initial_partial_block_242:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_242:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_242
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_242:
|
|
jmp .L_last_blocks_done_216
|
|
.L_last_num_blocks_is_14_216:
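# Tail of 14 blocks: three full ZMM groups plus two blocks in ymm5, masked with k1.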
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_243
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_243
|
|
|
|
.L_16_blocks_overflow_243:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_243:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %ymm29,%ymm5,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_244
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_244
|
|
.L_small_initial_partial_block_244:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_244:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_244
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_244:
|
|
jmp .L_last_blocks_done_216
|
|
.L_last_num_blocks_is_15_216:
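# Tail of 15 blocks: three full ZMM groups plus three blocks in zmm5, masked with k1.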
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_245
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_245
|
|
|
|
.L_16_blocks_overflow_245:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_245:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
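# Re-apply the byte mask so bytes beyond the message length are zero before
# the ciphertext is byte-reflected for GHASH.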
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
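# %r8 is reduced to the byte count of the 15th (possibly partial) block;
# the cmpq/jl below selects the full-block GHASH update when exactly 16
# bytes remain, otherwise the partial-block bookkeeping path.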
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_246
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_246
|
|
.L_small_initial_partial_block_246:
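# Partial final block: the leftover byte count is recorded at (%rdx) and
# the last block state in xmm11 is stashed at 16(%rsi), presumably so a
# later call can complete the block; the GHASH below therefore covers only
# the 14 complete blocks.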
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_246:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_246
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_246:
|
|
jmp .L_last_blocks_done_216
|
|
.L_last_num_blocks_is_16_216:
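# 16-block tail.  Unlike the smaller cases this one always falls through
# into .L_small_initial_partial_block_248 and records the trailing byte
# count unconditionally, since the 16th block may itself be partial.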
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_247
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_247
|
|
|
|
.L_16_blocks_overflow_247:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_247:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_248:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_248:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_248:
|
|
jmp .L_last_blocks_done_216
|
|
.L_last_num_blocks_is_0_216:
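# No new blocks to encrypt: only fold the GHASH of the 16 ciphertext
# blocks already parked at 768..960(%rsp) against the key-power table
# addressed via %rbx, then do the usual horizontal fold and POLY2
# reduction into xmm14.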
|
|
vmovdqa64 768(%rsp),%zmm13
|
|
vpxorq %zmm14,%zmm13,%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 832(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpxorq %zmm10,%zmm4,%zmm26
|
|
vpxorq %zmm6,%zmm0,%zmm24
|
|
vpxorq %zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 896(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 960(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_216:
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_172
|
|
.L_encrypt_16_blocks_172:
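# Steady-state 16-block step: encrypt the next 16 counter blocks while
# folding the GHASH of earlier parked ciphertext (768(%rsp) onward, then
# 1024(%rsp) onward against key powers at 256..448(%rsp)).  The fresh
# ciphertext is stored to the output, byte-reflected, and parked at
# 1280..1472(%rsp) for the next pass; 256 bytes are then consumed
# (%r8 -= 256, %r11 += 256).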
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_249
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_249
|
|
.L_16_blocks_overflow_249:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_249:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vpxorq %zmm12,%zmm6,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqa64 %zmm0,1280(%rsp)
|
|
vmovdqa64 %zmm3,1344(%rsp)
|
|
vmovdqa64 %zmm4,1408(%rsp)
|
|
vmovdqa64 %zmm5,1472(%rsp)
|
|
vmovdqa64 1024(%rsp),%zmm13
|
|
vmovdqu64 256(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1088(%rsp),%zmm13
|
|
vmovdqu64 320(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1152(%rsp),%zmm13
|
|
vmovdqu64 384(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1216(%rsp),%zmm13
|
|
vmovdqu64 448(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
subq $256,%r8
|
|
addq $256,%r11
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
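# %r10d = ceil(remaining bytes / 16), i.e. the number of blocks left.
# The comparison ladder below dispatches to the matching
# .L_last_num_blocks_is_N_250 handler (0..16).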
|
|
je .L_last_num_blocks_is_0_250
|
|
|
|
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_250
|
|
jb .L_last_num_blocks_is_7_1_250
|
|
|
|
|
|
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_250
|
|
jb .L_last_num_blocks_is_11_9_250
|
|
|
|
|
|
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_250
|
|
ja .L_last_num_blocks_is_16_250
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_250
|
|
jmp .L_last_num_blocks_is_13_250
|
|
|
|
.L_last_num_blocks_is_11_9_250:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_250
|
|
ja .L_last_num_blocks_is_11_250
|
|
jmp .L_last_num_blocks_is_9_250
|
|
|
|
.L_last_num_blocks_is_7_1_250:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_250
|
|
jb .L_last_num_blocks_is_3_1_250
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_250
|
|
je .L_last_num_blocks_is_6_250
|
|
jmp .L_last_num_blocks_is_5_250
|
|
|
|
.L_last_num_blocks_is_3_1_250:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_250
|
|
je .L_last_num_blocks_is_2_250
|
|
.L_last_num_blocks_is_1_250:
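# Single trailing block: one counter block is encrypted at xmm width with
# k1-masked load/store for the partial length, while the GHASH of the 16
# parked ciphertext blocks continues in the surrounding zmm arithmetic.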
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_251
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_251
|
|
|
|
.L_16_blocks_overflow_251:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_251:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %xmm29,%xmm0,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_252
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_252
|
|
.L_small_initial_partial_block_252:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_252
|
|
.L_small_initial_compute_done_252:
|
|
.L_after_reduction_252:
|
|
jmp .L_last_blocks_done_250
|
|
.L_last_num_blocks_is_2_250:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_253
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_253
|
|
|
|
.L_16_blocks_overflow_253:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_253:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %ymm29,%ymm0,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_254
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_254
|
|
.L_small_initial_partial_block_254:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_254:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_254
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_254:
|
|
jmp .L_last_blocks_done_250
|
|
.L_last_num_blocks_is_3_250:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_255
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_255
|
|
|
|
.L_16_blocks_overflow_255:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_255:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_256
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_256
|
|
.L_small_initial_partial_block_256:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_256:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_256
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_256:
|
|
jmp .L_last_blocks_done_250
|
|
.L_last_num_blocks_is_4_250:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_257
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_257
|
|
|
|
.L_16_blocks_overflow_257:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_257:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_258
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_258
|
|
.L_small_initial_partial_block_258:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_258:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_258
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_258:
|
|
jmp .L_last_blocks_done_250
|
|
.L_last_num_blocks_is_5_250:
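# 5-block tail: four blocks fill zmm0 and the fifth goes in xmm3, so the
# k1 mask is built from len-64 and only the second store is masked; the
# 6- and 7-block cases below follow the same split with ymm/zmm widths
# for the second vector.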
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_259
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_259
|
|
|
|
.L_16_blocks_overflow_259:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_259:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %xmm29,%xmm3,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_260
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_260
|
|
.L_small_initial_partial_block_260:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_260:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_260
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_260:
|
|
jmp .L_last_blocks_done_250
|
|
.L_last_num_blocks_is_6_250:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_261
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_261
|
|
|
|
.L_16_blocks_overflow_261:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_261:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %ymm29,%ymm3,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_262
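# All blocks in this group are complete: clear the stored partial-block
# length at (%rdx), then GHASH the six byte-reflected output blocks
# (%zmm17 and %ymm19) against the precomputed hash-key powers at
# 256(%rsi) and 320(%rsi), fold the high/low halves together and reduce
# modulo the GHASH polynomial (POLY2) into %xmm14.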
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_262
|
|
.L_small_initial_partial_block_262:
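# The sixth block is only partially filled: record the remaining byte
# count at (%rdx), save the last output block (%xmm11) into the context
# at 16(%rsi) so the partial block can be completed on a later call, and
# GHASH only the five complete blocks (hash-key powers at 272(%rsi) and
# 336(%rsi)); the partial block itself is folded in after the reduction.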
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_262:
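# If a partial block remains (%r8 != 0), XOR its byte-reflected value
# (%xmm7) into the GHASH accumulator %xmm14 before leaving the tail code.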
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_262
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_262:
|
|
jmp .L_last_blocks_done_250
|
|
.L_last_num_blocks_is_7_250:
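# Tail path for 7 remaining 16-byte blocks (the last one may be partial).
# %k1 is loaded from byte64_len_to_mask_table to cover the valid bytes of
# the partial second 64-byte chunk.  %r15d holds the low byte of the
# counter: if adding these blocks would carry out of that byte, the
# overflow path byte-swaps the counter with %zmm29 and increments it in
# big-endian form; otherwise plain vpaddd on the stored counter is enough.
# The AES-CTR rounds for the new blocks are interleaved with the GHASH of
# the previous 16 ciphertext blocks reloaded from the stack frame.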
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_263
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_263
|
|
|
|
.L_16_blocks_overflow_263:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_263:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_264
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_264
|
|
.L_small_initial_partial_block_264:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_264:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_264
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_264:
|
|
jmp .L_last_blocks_done_250
|
|
.L_last_num_blocks_is_8_250:
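# Same tail pattern for 8 remaining blocks: two full zmm counter vectors
# (%zmm0, %zmm3); only the last block of %zmm3 may be partial, so its
# load and store go through the %k1 byte mask.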
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_265
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_265
|
|
|
|
.L_16_blocks_overflow_265:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_265:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_266
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_266
|
|
.L_small_initial_partial_block_266:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_266:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_266
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_266:
|
|
jmp .L_last_blocks_done_250
|
|
.L_last_num_blocks_is_9_250:
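# Tail pattern for 9 remaining blocks: a third counter vector is needed,
# so one extra block is carried in %xmm4 next to %zmm0/%zmm3, and the
# mask offset moves to the third 64-byte chunk (subq $128).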
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_267
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_267
|
|
|
|
.L_16_blocks_overflow_267:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_267:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %xmm29,%xmm4,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_268
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_268
|
|
.L_small_initial_partial_block_268:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_268:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_268
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_268:
|
|
jmp .L_last_blocks_done_250
|
|
.L_last_num_blocks_is_10_250:
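# Tail pattern for 10 remaining blocks: the third counter vector holds
# two blocks (%ymm4); its load and store are masked with %k1.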
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_269
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_269
|
|
|
|
.L_16_blocks_overflow_269:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_269:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %ymm29,%ymm4,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_270
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_270
|
|
.L_small_initial_partial_block_270:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_270:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_270
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_270:
|
|
jmp .L_last_blocks_done_250
|
|
.L_last_num_blocks_is_11_250:
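# Tail pattern for 11 remaining blocks: three blocks sit in the third
# counter vector, so full %zmm4 arithmetic is used and the third 64-byte
# load/store is masked with %k1.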
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_271
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_271
|
|
|
|
.L_16_blocks_overflow_271:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_271:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_272
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_272
|
|
.L_small_initial_partial_block_272:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_272:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_272
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_272:
|
|
jmp .L_last_blocks_done_250
|
|
.L_last_num_blocks_is_12_250:
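# Tail pattern for 12 remaining blocks: three full counter vectors
# (%zmm0, %zmm3, %zmm4); only the last block of %zmm4 may be partial.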
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_273
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_273
|
|
|
|
.L_16_blocks_overflow_273:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_273:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_274
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_274
|
|
.L_small_initial_partial_block_274:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_274:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_274
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_274:
|
|
jmp .L_last_blocks_done_250
|
|
.L_last_num_blocks_is_13_250:
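# Tail pattern for 13 remaining blocks: a fourth counter vector (%xmm5)
# carries the single block beyond the first three zmm vectors, and the
# mask offset moves to the fourth 64-byte chunk (subq $192).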
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_275
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_275
|
|
|
|
.L_16_blocks_overflow_275:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_275:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %xmm29,%xmm5,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
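# %r8 now holds only the length of the final block: a full 16-byte block
# takes the path below, anything shorter jumps to the partial-block
# bookkeeping.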
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_276
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
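# Reduce the 256-bit GHASH product modulo the field polynomial: carry-less
# multiplies by the POLY2 constant fold the high half back down, and the
# final three-way XOR leaves the result in XMM14.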
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_276
|
|
.L_small_initial_partial_block_276:
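# Partial final block: the leftover byte count is stored at (%rdx) and the
# last cipher block (XMM11) in the context at 16(%rsi); only the completed
# blocks are hashed here, and after the reduction the zero-padded partial
# block is XORed into the hash value so its multiply by H can be finished
# once the block is complete.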
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_276:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_276
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_276:
|
|
jmp .L_last_blocks_done_250
|
|
.L_last_num_blocks_is_14_250:
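# 14 blocks remain: derive the byte mask for the final 64-byte chunk from
# byte64_len_to_mask_table, build counters in ZMM0/ZMM3/ZMM4 plus YMM5 for
# the last two blocks, then run the AES rounds interleaved with the GHASH
# of the 16 blocks cached on the stack.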
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_277
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_277
|
|
|
|
.L_16_blocks_overflow_277:
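# Counter-overflow path: the fast path adds pre-byteswapped increments to
# the reflected counter, which only works while the low counter byte does
# not wrap (%r15d appears to track it; the jae threshold is 256 minus the
# block count).  Here the counter is shuffled to normal order, stepped
# with ddq_add_1234 / ddq_add_4444 so the carry propagates, and shuffled
# back.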
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_277:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %ymm29,%ymm5,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_278
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_278
|
|
.L_small_initial_partial_block_278:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_278:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_278
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_278:
|
|
jmp .L_last_blocks_done_250
|
|
.L_last_num_blocks_is_15_250:
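# 15 blocks remain: blocks 13-15 sit in ZMM5, with the byte mask trimming
# the unused lane and any short tail.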
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_279
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_279
|
|
|
|
.L_16_blocks_overflow_279:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_279:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_280
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_280
|
|
.L_small_initial_partial_block_280:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_280:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_280
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_280:
|
|
jmp .L_last_blocks_done_250
|
|
.L_last_num_blocks_is_16_250:
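# 16 blocks remain: a full 4xZMM batch whose last block may still be
# short; this case falls straight into the partial-block bookkeeping below
# rather than taking the full-block shortcut.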
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_281
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_281
|
|
|
|
.L_16_blocks_overflow_281:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_281:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_282:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_282:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_282:
|
|
jmp .L_last_blocks_done_250
|
|
.L_last_num_blocks_is_0_250:
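# No whole blocks left to process: finish the GHASH of the 16 cached
# blocks at 1280..1472(%rsp) against the key powers at 512..704(%rsp),
# accumulating into ZMM24/25/26, then do the horizontal XOR and POLY2
# reduction into XMM14.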
|
|
vmovdqa64 1280(%rsp),%zmm13
|
|
vmovdqu64 512(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1344(%rsp),%zmm13
|
|
vmovdqu64 576(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1408(%rsp),%zmm13
|
|
vmovdqu64 640(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1472(%rsp),%zmm13
|
|
vmovdqu64 704(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_250:
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_172
|
|
|
|
.L_message_below_32_blocks_172:
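# Fewer than 32 blocks were left for the main loop: the 256 bytes just
# processed above are accounted for here (len -= 256, offset += 256) and
# control falls through to the tail dispatch; %r14 records whether the
# extra hash-key powers below have already been generated.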
|
|
|
|
|
|
subq $256,%r8
|
|
addq $256,%r11
|
|
movl %r8d,%r10d
|
|
testq %r14,%r14
|
|
jnz .L_skip_hkeys_precomputation_283
|
|
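# First call only: extend the table of hash-key powers used by the tail.
# One cached power is broadcast and the existing ZMM groups are repeatedly
# multiplied by it (carry-less multiply plus POLY2 reduction), storing the
# new groups at 448/384/320/256(%rsp).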
vmovdqu64 640(%rsp),%zmm3
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
|
|
|
|
vmovdqu64 576(%rsp),%zmm4
|
|
vmovdqu64 512(%rsp),%zmm5
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,448(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,384(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,320(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,256(%rsp)
|
|
.L_skip_hkeys_precomputation_283:
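# Mark the key powers as computed (%r14 = 1), point %rbx at the slice of
# the stacked powers matching the remaining length, round that length up
# to whole 16-byte blocks in %r10d, and branch through the compare tree to
# the matching .L_last_num_blocks_is_*_284 handler.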
|
|
movq $1,%r14
|
|
andl $~15,%r10d
|
|
movl $512,%ebx
|
|
subl %r10d,%ebx
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
|
|
je .L_last_num_blocks_is_0_284
|
|
|
|
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_284
|
|
jb .L_last_num_blocks_is_7_1_284
|
|
|
|
|
|
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_284
|
|
jb .L_last_num_blocks_is_11_9_284
|
|
|
|
|
|
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_284
|
|
ja .L_last_num_blocks_is_16_284
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_284
|
|
jmp .L_last_num_blocks_is_13_284
|
|
|
|
.L_last_num_blocks_is_11_9_284:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_284
|
|
ja .L_last_num_blocks_is_11_284
|
|
jmp .L_last_num_blocks_is_9_284
|
|
|
|
.L_last_num_blocks_is_7_1_284:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_284
|
|
jb .L_last_num_blocks_is_3_1_284
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_284
|
|
je .L_last_num_blocks_is_6_284
|
|
jmp .L_last_num_blocks_is_5_284
|
|
|
|
.L_last_num_blocks_is_3_1_284:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_284
|
|
je .L_last_num_blocks_is_2_284
|
|
.L_last_num_blocks_is_1_284:
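# One block (at most 16 bytes) remains: a single masked counter block in
# XMM0 is run through the AES rounds while the same instruction stream
# finishes the GHASH of the previous 16 blocks; the running hash is XORed
# into the first cached block at 768(%rsp) and the key powers come from
# (%rsp,%rbx,1).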
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_285
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_285
|
|
|
|
.L_16_blocks_overflow_285:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_285:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %xmm29,%xmm0,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_286
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_286
|
|
.L_small_initial_partial_block_286:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm0
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
|
|
vpslldq $8,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm3
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
|
|
vpsrldq $4,%xmm4,%xmm4
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm4,%xmm14
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_286
|
|
.L_small_initial_compute_done_286:
|
|
.L_after_reduction_286:
|
|
jmp .L_last_blocks_done_284
|
|
.L_last_num_blocks_is_2_284:
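# Two blocks remain: same pattern with YMM-sized counter and data path.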
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_287
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_287
|
|
|
|
.L_16_blocks_overflow_287:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_287:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %ymm29,%ymm0,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_288
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_288
|
|
.L_small_initial_partial_block_288:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_288:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_288
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_288:
|
|
jmp .L_last_blocks_done_284
|
|
.L_last_num_blocks_is_3_284:
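# Three blocks remain: one ZMM counter set, mask-truncated load and store.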
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_289
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_289
|
|
|
|
.L_16_blocks_overflow_289:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_289:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_290
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_290
|
|
.L_small_initial_partial_block_290:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_290:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_290
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_290:
|
|
jmp .L_last_blocks_done_284
|
|
.L_last_num_blocks_is_4_284:
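# Four blocks remain: one full ZMM; only the byte mask may shorten the
# last block.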
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_291
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_291
|
|
|
|
.L_16_blocks_overflow_291:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_291:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_292
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_292
|
|
.L_small_initial_partial_block_292:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_292:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_292
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_292:
|
|
jmp .L_last_blocks_done_284
|
|
.L_last_num_blocks_is_5_284:
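# Same tail handling for 5 remaining blocks: one full ZMM of counters plus a
# single XMM block, with %k1 masking the bytes past the first 64 (%r8 - 64).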
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_293
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_293
|
|
|
|
.L_16_blocks_overflow_293:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_293:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %xmm29,%xmm3,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_294
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_294
|
|
.L_small_initial_partial_block_294:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_294:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_294
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_294:
|
|
jmp .L_last_blocks_done_284
|
|
.L_last_num_blocks_is_6_284:
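# Tail with 6 remaining blocks: one full ZMM of counters plus a YMM pair, the
# second chunk loaded and stored through the %k1 byte mask.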
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_295
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_295
|
|
|
|
.L_16_blocks_overflow_295:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_295:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %ymm29,%ymm3,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_296
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_296
|
|
.L_small_initial_partial_block_296:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_296:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_296
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_296:
|
|
jmp .L_last_blocks_done_284
|
|
.L_last_num_blocks_is_7_284:
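# Tail with 7 remaining blocks: one full ZMM of counters plus a ZMM holding the
# last three blocks, masked by %k1.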
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_297
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_297
|
|
|
|
.L_16_blocks_overflow_297:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_297:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_298
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_298
|
|
.L_small_initial_partial_block_298:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_298:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_298
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_298:
|
|
jmp .L_last_blocks_done_284
|
|
.L_last_num_blocks_is_8_284:
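# Tail with 8 remaining blocks: two ZMM counter registers; the second 64-byte
# chunk is loaded and stored through the %k1 byte mask.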
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_299
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_299
|
|
|
|
.L_16_blocks_overflow_299:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_299:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_300
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_300
|
|
.L_small_initial_partial_block_300:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_300:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_300
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_300:
|
|
jmp .L_last_blocks_done_284
|
|
.L_last_num_blocks_is_9_284:
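# Tail with 9 remaining blocks: two full ZMMs of counters plus one XMM block,
# %k1 masking the bytes past 128 (%r8 - 128).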
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_301
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_301
|
|
|
|
.L_16_blocks_overflow_301:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_301:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %xmm29,%xmm4,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_302
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_302
|
|
.L_small_initial_partial_block_302:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_302:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_302
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_302:
|
|
jmp .L_last_blocks_done_284
|
|
.L_last_num_blocks_is_10_284:
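# Tail with 10 remaining blocks: two full ZMMs plus a YMM pair for the third,
# masked, 64-byte chunk.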
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_303
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_303
|
|
|
|
.L_16_blocks_overflow_303:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_303:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %ymm29,%ymm4,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_304
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_304
|
|
.L_small_initial_partial_block_304:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_304:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_304
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_304:
|
|
jmp .L_last_blocks_done_284
|
|
.L_last_num_blocks_is_11_284:
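# Tail with 11 remaining blocks: two full ZMMs plus a ZMM holding the last
# three blocks, masked by %k1.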
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_305
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_305
|
|
|
|
.L_16_blocks_overflow_305:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_305:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_306
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_306
|
|
.L_small_initial_partial_block_306:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_306:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_306
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_306:
|
|
jmp .L_last_blocks_done_284
|
|
.L_last_num_blocks_is_12_284:
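# Tail with 12 remaining blocks: three ZMM counter registers; the third 64-byte
# chunk is loaded and stored through the %k1 byte mask.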
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_307
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_307
|
|
|
|
.L_16_blocks_overflow_307:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_307:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_308
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_308
|
|
.L_small_initial_partial_block_308:
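# Partial final block: record the leftover byte count at (%rdx), stash the
# last ciphertext block at 16(%rsi) for the partial-block bookkeeping, then
# GHASH the processed blocks through the precomputed key powers in the
# context and reduce with POLY2 into %xmm14.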
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_308:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_308
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_308:
|
|
jmp .L_last_blocks_done_284
|
|
.L_last_num_blocks_is_13_284:
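# Tail path: 13 of the final 16 blocks remain.  A byte mask for the trailing
# partial 64-byte chunk is fetched from byte64_len_to_mask_table, 13 counter
# blocks are built (three full ZMMs plus one XMM), encrypted, and the
# ciphertext is folded into GHASH.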
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_309
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_309
|
|
|
|
.L_16_blocks_overflow_309:
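# Taken when the low counter byte would wrap within the next 13 blocks
# (cmpl $243 above): byte-swap the counter, do the additions in the domain
# where the carry propagates, then swap the results back.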
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_309:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %xmm29,%xmm5,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_310
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_310
|
|
.L_small_initial_partial_block_310:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_310:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_310
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_310:
|
|
jmp .L_last_blocks_done_284
|
|
.L_last_num_blocks_is_14_284:
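# Same tail handling for 14 remaining blocks: three ZMMs of counter blocks
# plus a YMM for the last two.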
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_311
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_311
|
|
|
|
.L_16_blocks_overflow_311:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_311:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %ymm29,%ymm5,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_312
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_312
|
|
.L_small_initial_partial_block_312:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_312:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_312
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_312:
|
|
jmp .L_last_blocks_done_284
|
|
.L_last_num_blocks_is_15_284:
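# Tail handling for 15 remaining blocks: four ZMMs of counter blocks, with
# %k1 limiting the fourth load/store to the real tail length.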
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_313
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_313
|
|
|
|
.L_16_blocks_overflow_313:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_313:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_314
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_314
|
|
.L_small_initial_partial_block_314:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_314:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_314
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_314:
|
|
jmp .L_last_blocks_done_284
|
|
.L_last_num_blocks_is_16_284:
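# Tail handling when all 16 blocks remain; only the trailing bytes of the
# very last block are masked via %k1.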
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_315
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_315
|
|
|
|
.L_16_blocks_overflow_315:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_315:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_316:
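# Note: unlike the shorter cases there is no 16-byte check above; with 16
# blocks remaining the partial-block bookkeeping below always runs.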
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_316:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_316:
|
|
jmp .L_last_blocks_done_284
|
|
.L_last_num_blocks_is_0_284:
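# No blocks left to encrypt: finish GHASH over the 16 blocks buffered on the
# stack at 768(%rsp) against the stacked key powers indexed by %rbx, then
# reduce the accumulator into %xmm14.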
|
|
vmovdqa64 768(%rsp),%zmm13
|
|
vpxorq %zmm14,%zmm13,%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 832(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpxorq %zmm10,%zmm4,%zmm26
|
|
vpxorq %zmm6,%zmm0,%zmm24
|
|
vpxorq %zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 896(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 960(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_284:
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_172
|
|
|
|
.L_message_below_equal_16_blocks_172:
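# Whole message fits in at most 16 blocks: %r12 = ceil(len/16), then a small
# branch tree dispatches to the matching 1..16-block routine.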
|
|
|
|
|
|
movl %r8d,%r12d
|
|
addl $15,%r12d
|
|
shrl $4,%r12d
|
|
cmpq $8,%r12
|
|
je .L_small_initial_num_blocks_is_8_317
|
|
jl .L_small_initial_num_blocks_is_7_1_317
|
|
|
|
|
|
cmpq $12,%r12
|
|
je .L_small_initial_num_blocks_is_12_317
|
|
jl .L_small_initial_num_blocks_is_11_9_317
|
|
|
|
|
|
cmpq $16,%r12
|
|
je .L_small_initial_num_blocks_is_16_317
|
|
cmpq $15,%r12
|
|
je .L_small_initial_num_blocks_is_15_317
|
|
cmpq $14,%r12
|
|
je .L_small_initial_num_blocks_is_14_317
|
|
jmp .L_small_initial_num_blocks_is_13_317
|
|
|
|
.L_small_initial_num_blocks_is_11_9_317:
|
|
|
|
cmpq $11,%r12
|
|
je .L_small_initial_num_blocks_is_11_317
|
|
cmpq $10,%r12
|
|
je .L_small_initial_num_blocks_is_10_317
|
|
jmp .L_small_initial_num_blocks_is_9_317
|
|
|
|
.L_small_initial_num_blocks_is_7_1_317:
|
|
cmpq $4,%r12
|
|
je .L_small_initial_num_blocks_is_4_317
|
|
jl .L_small_initial_num_blocks_is_3_1_317
|
|
|
|
cmpq $7,%r12
|
|
je .L_small_initial_num_blocks_is_7_317
|
|
cmpq $6,%r12
|
|
je .L_small_initial_num_blocks_is_6_317
|
|
jmp .L_small_initial_num_blocks_is_5_317
|
|
|
|
.L_small_initial_num_blocks_is_3_1_317:
|
|
|
|
cmpq $3,%r12
|
|
je .L_small_initial_num_blocks_is_3_317
|
|
cmpq $2,%r12
|
|
je .L_small_initial_num_blocks_is_2_317
|
|
|
|
|
|
|
|
|
|
|
|
.L_small_initial_num_blocks_is_1_317:
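# Single block: one counter block is AES-encrypted with the round keys at
# (%rdi); load and store are masked to the real byte length through %k1, and
# the zero-padded ciphertext is byte-reflected before entering GHASH.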
|
|
vmovdqa64 SHUF_MASK(%rip),%xmm29
|
|
vpaddd ONE(%rip),%xmm2,%xmm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %xmm15,%xmm0,%xmm0
|
|
vpxorq %xmm6,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %xmm29,%xmm0,%xmm6
|
|
vextracti32x4 $0,%zmm6,%xmm13
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_318
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_318
|
|
.L_small_initial_partial_block_318:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_318
|
|
.L_small_initial_compute_done_318:
|
|
.L_after_reduction_318:
|
|
jmp .L_small_initial_blocks_encrypted_317
|
|
.L_small_initial_num_blocks_is_2_317:
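# Two blocks: same flow on a YMM pair of counter blocks.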
|
|
vmovdqa64 SHUF_MASK(%rip),%ymm29
|
|
vshufi64x2 $0,%ymm2,%ymm2,%ymm0
|
|
vpaddd ddq_add_1234(%rip),%ymm0,%ymm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %ymm15,%ymm0,%ymm0
|
|
vpxorq %ymm6,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %ymm29,%ymm0,%ymm6
|
|
vextracti32x4 $1,%zmm6,%xmm13
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_319
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_319
|
|
.L_small_initial_partial_block_319:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_319:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_319
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_319:
|
|
jmp .L_small_initial_blocks_encrypted_317
|
|
.L_small_initial_num_blocks_is_3_317:
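# Three blocks: handled in a single masked ZMM.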
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vextracti32x4 $2,%zmm6,%xmm13
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_320
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_320
|
|
.L_small_initial_partial_block_320:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_320:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_320
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_320:
|
|
jmp .L_small_initial_blocks_encrypted_317
|
|
.L_small_initial_num_blocks_is_4_317:
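# Four blocks: a full ZMM of counter blocks; %k1 only trims the bytes of the
# last block.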
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vextracti32x4 $3,%zmm6,%xmm13
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_321
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_321
|
|
.L_small_initial_partial_block_321:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_321:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_321
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_321:
|
|
jmp .L_small_initial_blocks_encrypted_317
|
|
.L_small_initial_num_blocks_is_5_317:
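# Five blocks: one full ZMM plus a masked XMM for the fifth block.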
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %xmm15,%xmm3,%xmm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %xmm7,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %xmm29,%xmm3,%xmm7
|
|
vextracti32x4 $0,%zmm7,%xmm13
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_322
subq $16,%r8
|
|
movq $0,(%rdx)
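# Whole blocks only: clear the stored partial-block count, fold the running
# hash value (%zmm14) into the first ciphertext block, and multiply the five
# reflected blocks by what appear to be precomputed powers of H held at
# 272(%rsi) and 336(%rsi).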
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
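# Combine the middle products, split them across the high/low halves, XOR the
# four 128-bit lanes down to a single 128-bit value (GHASH sums the per-block
# products before reducing), then reduce modulo the GHASH polynomial using
# POLY2; vpternlogq $0x96 is a three-way XOR.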
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_322
|
|
.L_small_initial_partial_block_322:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_322:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_322
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_322:
|
|
jmp .L_small_initial_blocks_encrypted_317
|
|
.L_small_initial_num_blocks_is_6_317:
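# 6-block tail: same pattern as the 5-block case, with blocks 5-6 carried in a
# ymm pair (%ymm3 / %ymm7) and the GHASH step using the H powers at 256(%rsi)
# and 320(%rsi).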
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %ymm15,%ymm3,%ymm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %ymm7,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %ymm29,%ymm3,%ymm7
|
|
vextracti32x4 $1,%zmm7,%xmm13
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_323
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_323
|
|
.L_small_initial_partial_block_323:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_323:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_323
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_323:
|
|
jmp .L_small_initial_blocks_encrypted_317
|
|
.L_small_initial_num_blocks_is_7_317:
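# 7-block tail: blocks 5-7 ride in the low lanes of %zmm3 under the byte mask;
# GHASH uses 240(%rsi) plus the 304(%rsi)/336(%rsi) composite table entry.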
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vextracti32x4 $2,%zmm7,%xmm13
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_324
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_324
|
|
.L_small_initial_partial_block_324:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_324:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_324
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_324:
|
|
jmp .L_small_initial_blocks_encrypted_317
|
|
.L_small_initial_num_blocks_is_8_317:
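# 8-block tail: two full zmm groups of counters and data; GHASH uses the H
# powers at 224(%rsi) and 288(%rsi).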
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vextracti32x4 $3,%zmm7,%xmm13
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_325
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_325
|
|
.L_small_initial_partial_block_325:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_325:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_325
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_325:
|
|
jmp .L_small_initial_blocks_encrypted_317
|
|
.L_small_initial_num_blocks_is_9_317:
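# 9-block tail: a third counter register appears (%xmm4, via ddq_add_8888),
# 128 bytes are processed unmasked and the ninth block is mask-loaded with the
# len-128 mask.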
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %xmm15,%xmm4,%xmm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %xmm10,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %xmm29,%xmm4,%xmm10
|
|
vextracti32x4 $0,%zmm10,%xmm13
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_326
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_326
|
|
.L_small_initial_partial_block_326:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_326:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_326
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_326:
|
|
jmp .L_small_initial_blocks_encrypted_317
|
|
.L_small_initial_num_blocks_is_10_317:
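# 10-block tail: blocks 9-10 are handled in %ymm4 / %ymm10 under the byte mask.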
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %ymm15,%ymm4,%ymm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %ymm10,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %ymm29,%ymm4,%ymm10
|
|
vextracti32x4 $1,%zmm10,%xmm13
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_327
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_327
|
|
.L_small_initial_partial_block_327:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_327:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_327
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_327:
|
|
jmp .L_small_initial_blocks_encrypted_317
|
|
.L_small_initial_num_blocks_is_11_317:
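# 11-block tail: blocks 9-11 occupy the low lanes of %zmm4 under the byte mask.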
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %zmm29,%zmm4,%zmm10
|
|
vextracti32x4 $2,%zmm10,%xmm13
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_328
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_328
|
|
.L_small_initial_partial_block_328:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_328:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_328
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_328:
|
|
jmp .L_small_initial_blocks_encrypted_317
|
|
.L_small_initial_num_blocks_is_12_317:
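# 12-block tail: three full zmm groups; in the GHASH step below the third
# group's products are folded in with vpternlogq $0x96 (three-way XOR) instead
# of separate vpxorq steps.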
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %zmm29,%zmm4,%zmm10
|
|
vextracti32x4 $3,%zmm10,%xmm13
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_329
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 160(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_329
|
|
.L_small_initial_partial_block_329:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_329:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_329
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_329:
|
|
jmp .L_small_initial_blocks_encrypted_317
|
|
.L_small_initial_num_blocks_is_13_317:
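# 13-block tail: a fourth counter register (%xmm5) is added, 192 bytes go
# through unmasked and the last block is mask-loaded with the len-192 mask.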
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %xmm15,%xmm5,%xmm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %xmm11,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %zmm29,%zmm4,%zmm10
|
|
vpshufb %xmm29,%xmm5,%xmm11
|
|
vextracti32x4 $0,%zmm11,%xmm13
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_330
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 144(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_330
|
|
.L_small_initial_partial_block_330:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 160(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_330:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_330
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_330:
|
|
jmp .L_small_initial_blocks_encrypted_317
|
|
.L_small_initial_num_blocks_is_14_317:
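# 14-block tail: blocks 13-14 are handled in %ymm5 / %ymm11 under the byte mask.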
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %ymm15,%ymm5,%ymm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %ymm11,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %zmm29,%zmm4,%zmm10
|
|
vpshufb %ymm29,%ymm5,%ymm11
|
|
vextracti32x4 $1,%zmm11,%xmm13
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_331
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 128(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_331
|
|
.L_small_initial_partial_block_331:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 144(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_331:
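# %r8 is non-zero here only when the final block was partial; in that case XOR its
# byte-reflected value (%xmm13) into %xmm14 so it is carried in the running hash.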
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_331
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_331:
|
|
jmp .L_small_initial_blocks_encrypted_317
|
|
.L_small_initial_num_blocks_is_15_317:
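# 15-block tail: derive 16 big-endian counters from %zmm2, build a byte mask for
# the last 64-byte chunk from byte64_len_to_mask_table, encrypt with the 13 round
# keys at 0..192(%rdi) (12 AES rounds, i.e. a 192-bit key schedule), XOR with the
# input, store the ciphertext, and byte-reflect it into %zmm6/%zmm7/%zmm10/%zmm11
# for GHASH.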
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %zmm15,%zmm5,%zmm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %zmm11,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %zmm29,%zmm4,%zmm10
|
|
vpshufb %zmm29,%zmm5,%zmm11
|
|
vextracti32x4 $2,%zmm11,%xmm13
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_332
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 112(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_332
|
|
.L_small_initial_partial_block_332:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 128(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_332:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_332
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_332:
|
|
jmp .L_small_initial_blocks_encrypted_317
|
|
.L_small_initial_num_blocks_is_16_317:
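# 16-block tail: same pattern with all four counter vectors full. There is no
# last-block-complete test here; control falls straight into the partial-block
# accumulation at .L_small_initial_partial_block_333 below.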
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %zmm15,%zmm5,%zmm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %zmm11,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %zmm29,%zmm4,%zmm10
|
|
vpshufb %zmm29,%zmm5,%zmm11
|
|
vextracti32x4 $3,%zmm11,%xmm13
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_333:
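# Store the leftover byte count at (%rdx) and the 16th block (%xmm12) at 16(%rsi),
# GHASH the first 15 ciphertext blocks with the hash-key powers at 112..336(%rsi)
# and reduce via POLY2; the byte-reflected 16th block (%xmm13) is XORed into
# %xmm14 at .L_small_initial_compute_done_333.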
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 112(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_333:
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_333:
|
|
.L_small_initial_blocks_encrypted_317:
|
|
.L_ghash_done_172:
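# Persist state: the next counter block (%xmm2) is written back to 0(%rsi) and the
# GHASH accumulator (%xmm14) to 64(%rsi) before leaving through .Lexit_gcm_encrypt.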
|
|
vmovdqu64 %xmm2,0(%rsi)
|
|
vmovdqu64 %xmm14,64(%rsi)
|
|
.L_enc_dec_done_172:
|
|
jmp .Lexit_gcm_encrypt
|
|
.align 32
|
|
.Laes_gcm_encrypt_256_avx512:
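# AES-256 (14-round) GCM encrypt path. As used below: %rdi = expanded round keys,
# %rsi = GCM context (counter block at 0, partial-block buffer at 16, GHASH state
# at 64, hash-key powers from 96 and H at 336), %rcx = source, %r9 = destination,
# %r8 = remaining byte count, (%rdx) = bytes already buffered in a partial block.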
|
|
orq %r8,%r8
|
|
je .L_enc_dec_done_334
|
|
xorq %r14,%r14
|
|
vmovdqu64 64(%rsi),%xmm14
|
|
|
|
movq (%rdx),%r11
|
|
orq %r11,%r11
|
|
je .L_partial_block_done_335
|
|
movl $16,%r10d
|
|
leaq byte_len_to_mask_table(%rip),%r12
|
|
cmpq %r10,%r8
|
|
cmovcq %r8,%r10
|
|
kmovw (%r12,%r10,2),%k1
|
|
vmovdqu8 (%rcx),%xmm0{%k1}{z}
|
|
|
|
vmovdqu64 16(%rsi),%xmm3
|
|
vmovdqu64 336(%rsi),%xmm4
|
|
|
|
|
|
|
|
leaq SHIFT_MASK(%rip),%r12
|
|
addq %r11,%r12
|
|
vmovdqu64 (%r12),%xmm5
|
|
vpshufb %xmm5,%xmm3,%xmm3
|
|
vpxorq %xmm0,%xmm3,%xmm3
|
|
|
|
|
|
leaq (%r8,%r11,1),%r13
|
|
subq $16,%r13
|
|
jge .L_no_extra_mask_335
|
|
subq %r13,%r12
|
|
.L_no_extra_mask_335:
|
|
|
|
|
|
|
|
vmovdqu64 16(%r12),%xmm0
|
|
vpand %xmm0,%xmm3,%xmm3
|
|
vpshufb SHUF_MASK(%rip),%xmm3,%xmm3
|
|
vpshufb %xmm5,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm14,%xmm14
|
|
cmpq $0,%r13
|
|
jl .L_partial_incomplete_335
|
|
|
|
vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7
|
|
vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10
|
|
vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11
|
|
vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14
|
|
vpxorq %xmm11,%xmm14,%xmm14
|
|
|
|
vpsrldq $8,%xmm14,%xmm11
|
|
vpslldq $8,%xmm14,%xmm14
|
|
vpxorq %xmm11,%xmm7,%xmm7
|
|
vpxorq %xmm10,%xmm14,%xmm14
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%xmm11
|
|
|
|
vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10
|
|
vpslldq $8,%xmm10,%xmm10
|
|
vpxorq %xmm10,%xmm14,%xmm14
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10
|
|
vpsrldq $4,%xmm10,%xmm10
|
|
vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
|
|
vpternlogq $0x96,%xmm10,%xmm7,%xmm14
|
|
|
|
movq $0,(%rdx)
|
|
|
|
movq %r11,%r12
|
|
movq $16,%r11
|
|
subq %r12,%r11
|
|
jmp .L_enc_dec_done_335
|
|
|
|
.L_partial_incomplete_335:
|
|
addq %r8,(%rdx)
|
|
movq %r8,%r11
|
|
|
|
.L_enc_dec_done_335:
|
|
|
|
|
|
leaq byte_len_to_mask_table(%rip),%r12
|
|
kmovw (%r12,%r11,2),%k1
|
|
vmovdqu64 %xmm14,64(%rsi)
|
|
|
|
vpshufb SHUF_MASK(%rip),%xmm3,%xmm3
|
|
vpshufb %xmm5,%xmm3,%xmm3
|
|
movq %r9,%r12
|
|
vmovdqu8 %xmm3,(%r12){%k1}
|
|
.L_partial_block_done_335:
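# Any previously buffered partial block has been flushed; reload the counter block
# from 0(%rsi). Messages with at most 256 bytes left take the small-block path;
# larger ones fall through to the bulk code, which keeps byte-swapped counters in
# ZMM registers and defers GHASH by parking byte-reflected ciphertext on the stack.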
|
|
vmovdqu64 0(%rsi),%xmm2
|
|
subq %r11,%r8
|
|
je .L_enc_dec_done_334
|
|
cmpq $256,%r8
|
|
jbe .L_message_below_equal_16_blocks_334
|
|
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vmovdqa64 ddq_addbe_4444(%rip),%zmm27
|
|
vmovdqa64 ddq_addbe_1234(%rip),%zmm28
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vmovd %xmm2,%r15d
|
|
andl $255,%r15d
|
|
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
|
|
|
|
|
|
cmpb $240,%r15b
|
|
jae .L_next_16_overflow_336
|
|
vpaddd %zmm28,%zmm2,%zmm7
|
|
vpaddd %zmm27,%zmm7,%zmm10
|
|
vpaddd %zmm27,%zmm10,%zmm11
|
|
vpaddd %zmm27,%zmm11,%zmm12
|
|
jmp .L_next_16_ok_336
|
|
.L_next_16_overflow_336:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm12
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
|
|
vpaddd %zmm12,%zmm7,%zmm10
|
|
vpaddd %zmm12,%zmm10,%zmm11
|
|
vpaddd %zmm12,%zmm11,%zmm12
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vpshufb %zmm29,%zmm12,%zmm12
|
|
.L_next_16_ok_336:
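# First 16 blocks: %zmm7/%zmm10/%zmm11/%zmm12 hold the counter blocks. Encrypt them
# with the 15 round keys at 0..224(%rdi), XOR with the first 256 input bytes, store
# the ciphertext, then byte-reflect it into 768..960(%rsp) so its GHASH can be
# folded in while later groups are being encrypted.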
|
|
vshufi64x2 $255,%zmm12,%zmm12,%zmm2
|
|
addb $16,%r15b
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm0
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm3
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm4
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm5
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm6
|
|
vpxorq %zmm6,%zmm7,%zmm7
|
|
vpxorq %zmm6,%zmm10,%zmm10
|
|
vpxorq %zmm6,%zmm11,%zmm11
|
|
vpxorq %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 16(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 32(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 48(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 64(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 80(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 96(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 112(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 128(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 144(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 160(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 176(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 192(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 208(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 224(%rdi),%zmm6
|
|
vaesenclast %zmm6,%zmm7,%zmm7
|
|
vaesenclast %zmm6,%zmm10,%zmm10
|
|
vaesenclast %zmm6,%zmm11,%zmm11
|
|
vaesenclast %zmm6,%zmm12,%zmm12
|
|
|
|
|
|
vpxorq %zmm0,%zmm7,%zmm7
|
|
vpxorq %zmm3,%zmm10,%zmm10
|
|
vpxorq %zmm4,%zmm11,%zmm11
|
|
vpxorq %zmm5,%zmm12,%zmm12
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm7,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm10,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm11,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm12,192(%r10,%r11,1)
|
|
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vpshufb %zmm29,%zmm12,%zmm12
|
|
vmovdqa64 %zmm7,768(%rsp)
|
|
vmovdqa64 %zmm10,832(%rsp)
|
|
vmovdqa64 %zmm11,896(%rsp)
|
|
vmovdqa64 %zmm12,960(%rsp)
|
|
testq %r14,%r14
|
|
jnz .L_skip_hkeys_precomputation_337
|
|
|
|
vmovdqu64 288(%rsi),%zmm0
|
|
vmovdqu64 %zmm0,704(%rsp)
|
|
|
|
vmovdqu64 224(%rsi),%zmm3
|
|
vmovdqu64 %zmm3,640(%rsp)
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
|
|
|
|
vmovdqu64 160(%rsi),%zmm4
|
|
vmovdqu64 %zmm4,576(%rsp)
|
|
|
|
vmovdqu64 96(%rsi),%zmm5
|
|
vmovdqu64 %zmm5,512(%rsp)
|
|
.L_skip_hkeys_precomputation_337:
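# With fewer than 512 bytes (32 blocks) left, skip to the below-32-blocks tail.
# Otherwise encrypt a second 16-block group (byte-reflected copies go to
# 1024..1216(%rsp)) and, on the first pass only, extend the table of hash-key
# powers kept on the stack.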
|
|
cmpq $512,%r8
|
|
jb .L_message_below_32_blocks_334
|
|
|
|
|
|
|
|
cmpb $240,%r15b
|
|
jae .L_next_16_overflow_338
|
|
vpaddd %zmm28,%zmm2,%zmm7
|
|
vpaddd %zmm27,%zmm7,%zmm10
|
|
vpaddd %zmm27,%zmm10,%zmm11
|
|
vpaddd %zmm27,%zmm11,%zmm12
|
|
jmp .L_next_16_ok_338
|
|
.L_next_16_overflow_338:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm12
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
|
|
vpaddd %zmm12,%zmm7,%zmm10
|
|
vpaddd %zmm12,%zmm10,%zmm11
|
|
vpaddd %zmm12,%zmm11,%zmm12
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vpshufb %zmm29,%zmm12,%zmm12
|
|
.L_next_16_ok_338:
|
|
vshufi64x2 $255,%zmm12,%zmm12,%zmm2
|
|
addb $16,%r15b
|
|
|
|
vmovdqu8 256(%rcx,%r11,1),%zmm0
|
|
vmovdqu8 320(%rcx,%r11,1),%zmm3
|
|
vmovdqu8 384(%rcx,%r11,1),%zmm4
|
|
vmovdqu8 448(%rcx,%r11,1),%zmm5
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm6
|
|
vpxorq %zmm6,%zmm7,%zmm7
|
|
vpxorq %zmm6,%zmm10,%zmm10
|
|
vpxorq %zmm6,%zmm11,%zmm11
|
|
vpxorq %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 16(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 32(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 48(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 64(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 80(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 96(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 112(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 128(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 144(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 160(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 176(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 192(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 208(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 224(%rdi),%zmm6
|
|
vaesenclast %zmm6,%zmm7,%zmm7
|
|
vaesenclast %zmm6,%zmm10,%zmm10
|
|
vaesenclast %zmm6,%zmm11,%zmm11
|
|
vaesenclast %zmm6,%zmm12,%zmm12
|
|
|
|
|
|
vpxorq %zmm0,%zmm7,%zmm7
|
|
vpxorq %zmm3,%zmm10,%zmm10
|
|
vpxorq %zmm4,%zmm11,%zmm11
|
|
vpxorq %zmm5,%zmm12,%zmm12
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm7,256(%r10,%r11,1)
|
|
vmovdqu8 %zmm10,320(%r10,%r11,1)
|
|
vmovdqu8 %zmm11,384(%r10,%r11,1)
|
|
vmovdqu8 %zmm12,448(%r10,%r11,1)
|
|
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vpshufb %zmm29,%zmm12,%zmm12
|
|
vmovdqa64 %zmm7,1024(%rsp)
|
|
vmovdqa64 %zmm10,1088(%rsp)
|
|
vmovdqa64 %zmm11,1152(%rsp)
|
|
vmovdqa64 %zmm12,1216(%rsp)
|
|
testq %r14,%r14
|
|
jnz .L_skip_hkeys_precomputation_339
|
|
vmovdqu64 640(%rsp),%zmm3
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
|
|
|
|
vmovdqu64 576(%rsp),%zmm4
|
|
vmovdqu64 512(%rsp),%zmm5
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,448(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,384(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,320(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,256(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,192(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,128(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,64(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,0(%rsp)
|
|
.L_skip_hkeys_precomputation_339:
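# Hash-key powers for 48-block stitching now occupy 0..704(%rsp). Mark the
# precomputation as done (%r14 = 1), advance the data offset past the 512 bytes
# already encrypted, and enter the main loop while at least 768 bytes (48 blocks)
# remain.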
|
|
movq $1,%r14
|
|
addq $512,%r11
|
|
subq $512,%r8
|
|
|
|
cmpq $768,%r8
|
|
jb .L_no_more_big_nblocks_334
|
|
.L_encrypt_big_nblocks_334:
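# Stitched main loop: each pass encrypts 48 fresh blocks (three 16-block groups)
# while multiplying the 48 byte-reflected ciphertext blocks saved on the previous
# pass by the cached hash-key powers. Partial products accumulate in
# %zmm24/%zmm25/%zmm26 and are reduced once per pass; the reduced digest ends up in
# %xmm6 and is copied to %zmm14 at the end of the pass.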
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_340
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_340
|
|
.L_16_blocks_overflow_340:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_340:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vpxorq %zmm12,%zmm6,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqa64 %zmm0,1280(%rsp)
|
|
vmovdqa64 %zmm3,1344(%rsp)
|
|
vmovdqa64 %zmm4,1408(%rsp)
|
|
vmovdqa64 %zmm5,1472(%rsp)
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_341
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_341
|
|
.L_16_blocks_overflow_341:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_341:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 256(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 320(%rsp),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 384(%rsp),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 448(%rsp),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 256(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 320(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 384(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 448(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vpternlogq $0x96,%zmm12,%zmm6,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,256(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,320(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,384(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,448(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqa64 %zmm0,768(%rsp)
|
|
vmovdqa64 %zmm3,832(%rsp)
|
|
vmovdqa64 %zmm4,896(%rsp)
|
|
vmovdqa64 %zmm5,960(%rsp)
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_342
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_342
|
|
.L_16_blocks_overflow_342:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_342:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 512(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 576(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 640(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 704(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
|
|
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpternlogq $0x96,%zmm15,%zmm12,%zmm6
|
|
vpxorq %zmm24,%zmm6,%zmm6
|
|
vpternlogq $0x96,%zmm10,%zmm13,%zmm7
|
|
vpxorq %zmm25,%zmm7,%zmm7
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vextracti64x4 $1,%zmm6,%ymm12
|
|
vpxorq %ymm12,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm12
|
|
vpxorq %xmm12,%xmm6,%xmm6
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm6
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,512(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,576(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,640(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,704(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqa64 %zmm0,1024(%rsp)
|
|
vmovdqa64 %zmm3,1088(%rsp)
|
|
vmovdqa64 %zmm4,1152(%rsp)
|
|
vmovdqa64 %zmm5,1216(%rsp)
|
|
vmovdqa64 %zmm6,%zmm14
|
|
|
|
addq $768,%r11
|
|
subq $768,%r8
|
|
cmpq $768,%r8
|
|
jae .L_encrypt_big_nblocks_334
|
|
|
|
.L_no_more_big_nblocks_334:
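# Fewer than 48 blocks remain: encrypt one more 32- or 16-block batch if possible,
# otherwise fall through and fold the ciphertext blocks still parked on the stack
# into the GHASH accumulators before handling the final 1..16 blocks.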
|
|
|
|
cmpq $512,%r8
|
|
jae .L_encrypt_32_blocks_334
|
|
|
|
cmpq $256,%r8
|
|
jae .L_encrypt_16_blocks_334
|
|
.L_encrypt_0_blocks_ghash_32_334:
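# No whole 16-block group left to encrypt. %rbx selects the matching slice of the
# stacked hash-key powers for the blocks buffered at 768..960(%rsp); their products
# are folded into %zmm24/%zmm25/%zmm26, and the compare ladder below dispatches on
# the number (1..16) of blocks that are left.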
|
|
movl %r8d,%r10d
|
|
andl $~15,%r10d
|
|
movl $256,%ebx
|
|
subl %r10d,%ebx
|
|
vmovdqa64 768(%rsp),%zmm13
|
|
vpxorq %zmm14,%zmm13,%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 832(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpxorq %zmm10,%zmm4,%zmm26
|
|
vpxorq %zmm6,%zmm0,%zmm24
|
|
vpxorq %zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 896(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 960(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
addl $256,%ebx
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
|
|
je .L_last_num_blocks_is_0_343
|
|
|
|
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_343
|
|
jb .L_last_num_blocks_is_7_1_343
|
|
|
|
|
|
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_343
|
|
jb .L_last_num_blocks_is_11_9_343
|
|
|
|
|
|
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_343
|
|
ja .L_last_num_blocks_is_16_343
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_343
|
|
jmp .L_last_num_blocks_is_13_343
|
|
|
|
.L_last_num_blocks_is_11_9_343:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_343
|
|
ja .L_last_num_blocks_is_11_343
|
|
jmp .L_last_num_blocks_is_9_343
|
|
|
|
.L_last_num_blocks_is_7_1_343:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_343
|
|
jb .L_last_num_blocks_is_3_1_343
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_343
|
|
je .L_last_num_blocks_is_6_343
|
|
jmp .L_last_num_blocks_is_5_343
|
|
|
|
.L_last_num_blocks_is_3_1_343:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_343
|
|
je .L_last_num_blocks_is_2_343
|
|
.L_last_num_blocks_is_1_343:
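# One trailing block: build a single counter in %xmm0 and run the 14 AES rounds on
# it alone, while the surrounding ZMM multiplies finish the deferred GHASH of the
# blocks saved at 1024..1216(%rsp); the last load/store is masked with %k1 and the
# reduction completes into %xmm14.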
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_344
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_344
|
|
|
|
.L_16_blocks_overflow_344:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_344:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %xmm29,%xmm0,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_345
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_345
|
|
.L_small_initial_partial_block_345:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm0
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
|
|
vpslldq $8,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm3
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
|
|
vpsrldq $4,%xmm4,%xmm4
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm4,%xmm14
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_345
|
|
.L_small_initial_compute_done_345:
|
|
.L_after_reduction_345:
|
|
jmp .L_last_blocks_done_343
|
|
.L_last_num_blocks_is_2_343:
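# Two trailing blocks: same structure using %ymm0 for the counters; the full-pair
# GHASH uses the two hash-key powers at 320(%rsi), while the partial-block variant
# uses only the single power at 336(%rsi).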
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_346
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_346
|
|
|
|
.L_16_blocks_overflow_346:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_346:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %ymm29,%ymm0,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_347
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_347
|
|
.L_small_initial_partial_block_347:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_347:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_347
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_347:
|
|
jmp .L_last_blocks_done_343
|
|
.L_last_num_blocks_is_3_343:
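# Tail: 3 blocks (33-48 bytes) remain, processed as a single masked 64-byte lane.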
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_348
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_348
|
|
|
|
.L_16_blocks_overflow_348:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_348:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_349
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_349
|
|
.L_small_initial_partial_block_349:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_349:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_349
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_349:
|
|
jmp .L_last_blocks_done_343
|
|
.L_last_num_blocks_is_4_343:
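# Tail: 4 blocks (49-64 bytes) remain, processed as a single masked 64-byte lane.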
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_350
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_350
|
|
|
|
.L_16_blocks_overflow_350:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_350:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_351
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_351
|
|
.L_small_initial_partial_block_351:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_351:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_351
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_351:
|
|
jmp .L_last_blocks_done_343
|
|
.L_last_num_blocks_is_5_343:
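# Tail: 5 blocks (65-80 bytes) remain: one full 64-byte lane plus a masked 16-byte lane.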
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_352
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_352
|
|
|
|
.L_16_blocks_overflow_352:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_352:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %xmm29,%xmm3,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_353
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_353
|
|
.L_small_initial_partial_block_353:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_353:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_353
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_353:
|
|
jmp .L_last_blocks_done_343
|
|
.L_last_num_blocks_is_6_343:
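# Tail: 6 blocks (81-96 bytes) remain: one full 64-byte lane plus a masked 32-byte lane.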
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_354
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_354
|
|
|
|
.L_16_blocks_overflow_354:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_354:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %ymm29,%ymm3,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_355
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_355
|
|
.L_small_initial_partial_block_355:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_355:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_355
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_355:
|
|
jmp .L_last_blocks_done_343
|
|
.L_last_num_blocks_is_7_343:
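# Tail: 7 blocks (97-112 bytes) remain: one full 64-byte lane plus a masked 64-byte lane.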
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_356
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_356
|
|
|
|
.L_16_blocks_overflow_356:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_356:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_357
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_357
|
|
.L_small_initial_partial_block_357:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_357:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_357
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_357:
|
|
jmp .L_last_blocks_done_343
|
|
.L_last_num_blocks_is_8_343:
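# Tail: 8 blocks (113-128 bytes) remain: two 64-byte lanes, the second one masked.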
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_358
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_358
|
|
|
|
.L_16_blocks_overflow_358:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_358:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_359
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_359
|
|
.L_small_initial_partial_block_359:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_359:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_359
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_359:
|
|
jmp .L_last_blocks_done_343
|
|
.L_last_num_blocks_is_9_343:
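# Tail: 9 blocks (129-144 bytes) remain: two full 64-byte lanes plus a masked 16-byte lane.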
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_360
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_360
|
|
|
|
.L_16_blocks_overflow_360:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_360:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %xmm29,%xmm4,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_361
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_361
|
|
.L_small_initial_partial_block_361:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_361:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_361
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_361:
|
|
jmp .L_last_blocks_done_343
|
|
.L_last_num_blocks_is_10_343:
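# Tail: 10 blocks (145-160 bytes) remain: two full 64-byte lanes plus a masked 32-byte lane.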
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_362
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_362
|
|
|
|
.L_16_blocks_overflow_362:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_362:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %ymm29,%ymm4,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_363
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_363
|
|
.L_small_initial_partial_block_363:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_363:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_363
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_363:
|
|
jmp .L_last_blocks_done_343
|
|
.L_last_num_blocks_is_11_343:
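# Tail of 11 counter blocks. byte64_len_to_mask_table is indexed by the bytes
# that fall in the third 64-byte chunk (%r8 - 128) to build the load/store mask
# %k1 for the last, partly used %zmm lane.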
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_364
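# Fast path: adding 11 will not carry out of the low counter byte (tracked in
# %r15d), so the big-endian counters can be advanced with packed dword adds;
# otherwise the overflow path byte-swaps, adds 1/2/3/4 and 4/4/4/4 as proper
# 32-bit integers, and swaps back.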
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_364
|
|
|
|
.L_16_blocks_overflow_364:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_364:
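# Stitched pass for this tail: the AES rounds for %zmm0/%zmm3/%zmm4 are
# interleaved with the carry-less multiplies that fold the 16 blocks saved on
# the stack from the previous pass into the GHASH accumulators
# %zmm24/%zmm25/%zmm26; %rbx appears to select the hash-key power set for this
# block count.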
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
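# Keystream ready: XOR with the (masked) input to get ciphertext, write it out
# with the last 64-byte lane under %k1, keep the last block in %xmm11 for the
# partial-block buffer, and byte-reflect the ciphertext into %zmm17/19/20 as
# the next GHASH input.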
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_365
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_365
|
|
.L_small_initial_partial_block_365:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_365:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_365
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_365:
|
|
jmp .L_last_blocks_done_343
|
|
.L_last_num_blocks_is_12_343:
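# 12-block tail: same template as the 11-block case above; only the overflow
# threshold (244), the lane extracted for the last block ($3) and the hash-key
# offsets differ.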
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_366
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_366
|
|
|
|
.L_16_blocks_overflow_366:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_366:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_367
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_367
|
|
.L_small_initial_partial_block_367:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_367:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_367
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_367:
|
|
jmp .L_last_blocks_done_343
|
|
.L_last_num_blocks_is_13_343:
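# The 13..16-block tails bring in a fourth counter vector (%xmm5/%ymm5/%zmm5 as
# the count grows) and index the mask table with %r8 - 192, since the last
# bytes now fall in the fourth 64-byte chunk.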
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_368
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_368
|
|
|
|
.L_16_blocks_overflow_368:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_368:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %xmm29,%xmm5,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_369
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_369
|
|
.L_small_initial_partial_block_369:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_369:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_369
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_369:
|
|
jmp .L_last_blocks_done_343
|
|
.L_last_num_blocks_is_14_343:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_370
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_370
|
|
|
|
.L_16_blocks_overflow_370:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_370:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %ymm29,%ymm5,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_371
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_371
|
|
.L_small_initial_partial_block_371:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_371:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_371
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_371:
|
|
jmp .L_last_blocks_done_343
|
|
.L_last_num_blocks_is_15_343:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_372
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_372
|
|
|
|
.L_16_blocks_overflow_372:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_372:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_373
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_373
|
|
.L_small_initial_partial_block_373:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_373:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_373
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_373:
|
|
jmp .L_last_blocks_done_343
|
|
.L_last_num_blocks_is_16_343:
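# 16-block tail: there is no full-block shortcut here; control falls straight
# into the partial-block bookkeeping below, which GHASHes the first 15 blocks
# and defers the final block (its byte-reflected value is XORed into the hash
# afterwards).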
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_374
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_374
|
|
|
|
.L_16_blocks_overflow_374:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_374:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_375:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_375:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_375:
|
|
jmp .L_last_blocks_done_343
|
|
.L_last_num_blocks_is_0_343:
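# No message blocks left in the tail: just finish the GHASH of the 16 blocks
# saved from the previous pass (four 4-block multiplies into the
# %zmm24/25/26 accumulators) and reduce to the final hash in %xmm14.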
|
|
vmovdqa64 1024(%rsp),%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1088(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1152(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1216(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_343:
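# Common tail exit: put the counter block in %xmm2 back into stored byte order
# and join the final GHASH/output code.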
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_334
|
|
.L_encrypt_32_blocks_334:
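# Main-loop body taken while at least 32 blocks remain: encrypt 16 fresh
# counter blocks while folding the GHASH of the previous 16 (the running hash
# %zmm14 is XORed into the first saved block at 768(%rsp)); the new ciphertext
# is written out and a byte-reflected copy is parked at 1280..1472(%rsp) for
# the next pass. The 15 round keys used (0..224(%rdi)) correspond to the
# AES-256 form of the routine.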
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_376
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_376
|
|
.L_16_blocks_overflow_376:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_376:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vpxorq %zmm12,%zmm6,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqa64 %zmm0,1280(%rsp)
|
|
vmovdqa64 %zmm3,1344(%rsp)
|
|
vmovdqa64 %zmm4,1408(%rsp)
|
|
vmovdqa64 %zmm5,1472(%rsp)
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_377
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_377
|
|
.L_16_blocks_overflow_377:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_377:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 256(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 320(%rsp),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 384(%rsp),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 448(%rsp),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 256(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 320(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 384(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 448(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vpternlogq $0x96,%zmm12,%zmm6,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,256(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,320(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,384(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,448(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqa64 %zmm0,768(%rsp)
|
|
vmovdqa64 %zmm3,832(%rsp)
|
|
vmovdqa64 %zmm4,896(%rsp)
|
|
vmovdqa64 %zmm5,960(%rsp)
|
|
vmovdqa64 1280(%rsp),%zmm13
|
|
vmovdqu64 512(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1344(%rsp),%zmm13
|
|
vmovdqu64 576(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1408(%rsp),%zmm13
|
|
vmovdqu64 640(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1472(%rsp),%zmm13
|
|
vmovdqu64 704(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
subq $512,%r8
addq $512,%r11
movl %r8d,%r10d
andl $~15,%r10d
movl $512,%ebx
subl %r10d,%ebx
movl %r8d,%r10d
addl $15,%r10d
shrl $4,%r10d
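# %r8 holds the bytes still to process and %r10d = (%r8 + 15) >> 4, i.e. the
# number of 16-byte blocks left in this tail (0..16); the compare ladder below
# dispatches to the matching .L_last_num_blocks_is_N handler.  %rbx (512 minus
# the whole-block byte count) appears to index the stack area holding the
# precomputed hash-key powers used for that block count.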
je .L_last_num_blocks_is_0_378
cmpl $8,%r10d
je .L_last_num_blocks_is_8_378
jb .L_last_num_blocks_is_7_1_378
cmpl $12,%r10d
je .L_last_num_blocks_is_12_378
jb .L_last_num_blocks_is_11_9_378
cmpl $15,%r10d
je .L_last_num_blocks_is_15_378
ja .L_last_num_blocks_is_16_378
cmpl $14,%r10d
je .L_last_num_blocks_is_14_378
jmp .L_last_num_blocks_is_13_378
.L_last_num_blocks_is_11_9_378:
cmpl $10,%r10d
je .L_last_num_blocks_is_10_378
ja .L_last_num_blocks_is_11_378
jmp .L_last_num_blocks_is_9_378
.L_last_num_blocks_is_7_1_378:
cmpl $4,%r10d
je .L_last_num_blocks_is_4_378
jb .L_last_num_blocks_is_3_1_378
cmpl $6,%r10d
ja .L_last_num_blocks_is_7_378
je .L_last_num_blocks_is_6_378
jmp .L_last_num_blocks_is_5_378
.L_last_num_blocks_is_3_1_378:
cmpl $2,%r10d
ja .L_last_num_blocks_is_3_378
je .L_last_num_blocks_is_2_378
.L_last_num_blocks_is_1_378:
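# Tail handler for a single remaining (possibly partial) block: a byte mask
# for the remaining length is taken from byte64_len_to_mask_table, one counter
# block is encrypted, the input is loaded and the ciphertext stored through
# that mask, and the zero-padded, byte-reflected block is then folded into the
# GHASH state.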
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_379
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_379
|
|
|
|
.L_16_blocks_overflow_379:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_379:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %xmm29,%xmm0,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_380
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_380
.L_small_initial_partial_block_380:
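# Partial-block case: the number of outstanding bytes is recorded at (%rdx)
# and what appears to be the last ciphertext block is stashed in the context
# at 16(%rsi) so a later call can complete it; the accumulated GHASH state is
# reduced here, and the partial block is only XORed into the running hash
# (its multiplication by the hash key is deferred until the block is full).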
movq %r8,(%rdx)
vmovdqu64 %xmm11,16(%rsi)
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm0
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
|
|
vpslldq $8,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm3
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
|
|
vpsrldq $4,%xmm4,%xmm4
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm4,%xmm14
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_380
|
|
.L_small_initial_compute_done_380:
|
|
.L_after_reduction_380:
|
|
jmp .L_last_blocks_done_378
|
|
.L_last_num_blocks_is_2_378:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_381
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_381
|
|
|
|
.L_16_blocks_overflow_381:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_381:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %ymm29,%ymm0,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_382
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_382
|
|
.L_small_initial_partial_block_382:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_382:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_382
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_382:
|
|
jmp .L_last_blocks_done_378
|
|
.L_last_num_blocks_is_3_378:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_383
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_383
|
|
|
|
.L_16_blocks_overflow_383:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_383:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_384
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_384
|
|
.L_small_initial_partial_block_384:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_384:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_384
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_384:
|
|
jmp .L_last_blocks_done_378
|
|
.L_last_num_blocks_is_4_378:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_385
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_385
|
|
|
|
.L_16_blocks_overflow_385:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_385:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_386
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_386
|
|
.L_small_initial_partial_block_386:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_386:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_386
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_386:
|
|
jmp .L_last_blocks_done_378
|
|
.L_last_num_blocks_is_5_378:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_387
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_387
|
|
|
|
.L_16_blocks_overflow_387:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_387:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %xmm29,%xmm3,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_388
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_388
|
|
.L_small_initial_partial_block_388:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_388:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_388
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_388:
|
|
jmp .L_last_blocks_done_378
|
|
.L_last_num_blocks_is_6_378:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_389
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_389
|
|
|
|
.L_16_blocks_overflow_389:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_389:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %ymm29,%ymm3,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_390
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_390
|
|
.L_small_initial_partial_block_390:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_390:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_390
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_390:
|
|
jmp .L_last_blocks_done_378
|
|
.L_last_num_blocks_is_7_378:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_391
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_391
|
|
|
|
.L_16_blocks_overflow_391:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_391:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_392
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_392
|
|
.L_small_initial_partial_block_392:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_392:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_392
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_392:
|
|
jmp .L_last_blocks_done_378
|
|
.L_last_num_blocks_is_8_378:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_393
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_393
|
|
|
|
.L_16_blocks_overflow_393:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_393:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_394
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_394
|
|
.L_small_initial_partial_block_394:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_394:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_394
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_394:
|
|
jmp .L_last_blocks_done_378
|
|
.L_last_num_blocks_is_9_378:
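# 9-block tail: same structure as the 8-block path above, with the ninth
# counter block carried in %xmm4 (note the subq $128 when indexing
# byte64_len_to_mask_table and the $247 counter-overflow threshold).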
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_395
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_395
|
|
|
|
.L_16_blocks_overflow_395:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_395:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %xmm29,%xmm4,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_396
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_396
|
|
.L_small_initial_partial_block_396:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_396:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_396
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_396:
|
|
jmp .L_last_blocks_done_378
|
|
.L_last_num_blocks_is_10_378:
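# 10-block tail: blocks 9 and 10 ride in %ymm4; otherwise the same shape as
# the preceding paths (length mask from byte64_len_to_mask_table, $246
# overflow threshold, AES rounds from (%rdi), GHASH accumulation via (%rsi)).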
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_397
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_397
|
|
|
|
.L_16_blocks_overflow_397:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_397:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %ymm29,%ymm4,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_398
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_398
|
|
.L_small_initial_partial_block_398:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_398:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_398
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_398:
|
|
jmp .L_last_blocks_done_378
|
|
.L_last_num_blocks_is_11_378:
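# 11-block tail: a third counter register (%zmm4) now carries the trailing
# blocks, with the masked loads and stores covering only the bytes that
# remain ($245 overflow threshold).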
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_399
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_399
|
|
|
|
.L_16_blocks_overflow_399:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_399:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_400
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_400
|
|
.L_small_initial_partial_block_400:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_400:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_400
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_400:
|
|
jmp .L_last_blocks_done_378
|
|
.L_last_num_blocks_is_12_378:
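# 12-block tail: three full ZMM registers of counter blocks.  As in the other
# paths, the GHASH key powers appear to be fetched from progressively lower
# offsets off (%rsi) as more trailing blocks have to be hashed.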
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_401
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_401
|
|
|
|
.L_16_blocks_overflow_401:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_401:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_402
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_402
|
|
.L_small_initial_partial_block_402:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_402:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_402
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_402:
|
|
jmp .L_last_blocks_done_378
|
|
.L_last_num_blocks_is_13_378:
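# 13-block tail: a fourth counter group starts here, with the 13th block in
# %xmm5 and the length mask taken against %r8 minus 192 ($243 overflow
# threshold).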
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_403
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_403
|
|
|
|
.L_16_blocks_overflow_403:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_403:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %xmm29,%xmm5,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_404
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_404
|
|
.L_small_initial_partial_block_404:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_404:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_404
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_404:
|
|
jmp .L_last_blocks_done_378
|
|
.L_last_num_blocks_is_14_378:
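# 14-block tail: blocks 13 and 14 in %ymm5; otherwise the same CTR-encrypt and
# GHASH pattern as the paths above ($242 overflow threshold).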
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_405
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_405
|
|
|
|
.L_16_blocks_overflow_405:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_405:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %ymm29,%ymm5,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_406
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_406
|
|
.L_small_initial_partial_block_406:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_406:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_406
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_406:
|
|
jmp .L_last_blocks_done_378
|
|
.L_last_num_blocks_is_15_378:
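# 15-block tail: %zmm5 carries the last three blocks ($241 overflow
# threshold).  The final masked store writes only the remaining bytes before
# what looks like the partial-block bookkeeping at (%rdx) and 16(%rsi).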
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_407
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_407
|
|
|
|
.L_16_blocks_overflow_407:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_407:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
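# Last round key comes from 224(%rdi): this unrolled path applies 14 AES rounds (AES-256 key schedule).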
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
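# Write the ciphertext: three full 64-byte stores plus a masked store for the partial final chunk.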
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_408
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
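# Reduce the 256-bit GHASH product (high half in xmm0, low half in xmm3) modulo the GCM
# polynomial using POLY2; the folded 128-bit result lands in xmm14.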
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_408
|
|
.L_small_initial_partial_block_408:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_408:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_408
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_408:
|
|
jmp .L_last_blocks_done_378
|
|
.L_last_num_blocks_is_16_378:
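# Tail handler: a full 16 blocks remain (largest last-blocks case).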
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_409
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_409
.L_16_blocks_overflow_409:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_409:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_410:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_410:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_410:
|
|
jmp .L_last_blocks_done_378
|
|
.L_last_num_blocks_is_0_378:
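# Nothing left to encrypt: fold the 16 blocks already buffered for hashing into the GHASH
# accumulator and reduce it into xmm14.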
|
|
vmovdqa64 768(%rsp),%zmm13
|
|
vpxorq %zmm14,%zmm13,%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 832(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpxorq %zmm10,%zmm4,%zmm26
|
|
vpxorq %zmm6,%zmm0,%zmm24
|
|
vpxorq %zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 896(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 960(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_378:
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_334
|
|
.L_encrypt_16_blocks_334:
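# Steady-state path: encrypt the next 16 counter blocks while folding the previous 16
# ciphertext blocks into the GHASH partial accumulators (zmm24/zmm25/zmm26).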
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_411
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_411
|
|
.L_16_blocks_overflow_411:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_411:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp),%zmm1
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vpxorq %zmm12,%zmm6,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqa64 %zmm0,1280(%rsp)
|
|
vmovdqa64 %zmm3,1344(%rsp)
|
|
vmovdqa64 %zmm4,1408(%rsp)
|
|
vmovdqa64 %zmm5,1472(%rsp)
|
|
vmovdqa64 1024(%rsp),%zmm13
|
|
vmovdqu64 256(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1088(%rsp),%zmm13
|
|
vmovdqu64 320(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1152(%rsp),%zmm13
|
|
vmovdqu64 384(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1216(%rsp),%zmm13
|
|
vmovdqu64 448(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
subq $256,%r8
|
|
addq $256,%r11
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
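# 256 bytes were consumed above; r10d = ceil(remaining bytes / 16), i.e. the number of
# blocks left, used to dispatch to the matching last_num_blocks handler below.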
|
|
je .L_last_num_blocks_is_0_412
|
|
|
|
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_412
|
|
jb .L_last_num_blocks_is_7_1_412
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_412
|
|
jb .L_last_num_blocks_is_11_9_412
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_412
|
|
ja .L_last_num_blocks_is_16_412
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_412
|
|
jmp .L_last_num_blocks_is_13_412
|
|
|
|
.L_last_num_blocks_is_11_9_412:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_412
|
|
ja .L_last_num_blocks_is_11_412
|
|
jmp .L_last_num_blocks_is_9_412
|
|
|
|
.L_last_num_blocks_is_7_1_412:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_412
|
|
jb .L_last_num_blocks_is_3_1_412
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_412
|
|
je .L_last_num_blocks_is_6_412
|
|
jmp .L_last_num_blocks_is_5_412
|
|
|
|
.L_last_num_blocks_is_3_1_412:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_412
|
|
je .L_last_num_blocks_is_2_412
|
|
.L_last_num_blocks_is_1_412:
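# Tail handler: a single block remains; only the low 128-bit lane is used, under mask k1.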
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_413
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_413
|
|
|
|
.L_16_blocks_overflow_413:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_413:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
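# Keep the final counter value of this tail in zmm2 (the CTR state carried forward).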
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %xmm29,%xmm0,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_414
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_414
|
|
.L_small_initial_partial_block_414:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_414
|
|
.L_small_initial_compute_done_414:
|
|
.L_after_reduction_414:
|
|
jmp .L_last_blocks_done_412
|
|
.L_last_num_blocks_is_2_412:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_415
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_415
|
|
|
|
.L_16_blocks_overflow_415:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_415:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %ymm29,%ymm0,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_416
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_416
|
|
.L_small_initial_partial_block_416:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_416:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_416
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_416:
|
|
jmp .L_last_blocks_done_412
|
|
.L_last_num_blocks_is_3_412:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_417
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_417
|
|
|
|
.L_16_blocks_overflow_417:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_417:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_418
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_418
|
|
.L_small_initial_partial_block_418:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_418:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_418
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_418:
|
|
jmp .L_last_blocks_done_412
|
|
.L_last_num_blocks_is_4_412:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_419
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_419
|
|
|
|
.L_16_blocks_overflow_419:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_419:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_420
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_420
|
|
.L_small_initial_partial_block_420:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_420:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_420
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_420:
|
|
jmp .L_last_blocks_done_412
|
|
.L_last_num_blocks_is_5_412:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_421
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_421
|
|
|
|
.L_16_blocks_overflow_421:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_421:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %xmm29,%xmm3,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_422
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_422
|
|
.L_small_initial_partial_block_422:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_422:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_422
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_422:
|
|
jmp .L_last_blocks_done_412
|
|
.L_last_num_blocks_is_6_412:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_423
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_423
|
|
|
|
.L_16_blocks_overflow_423:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_423:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %ymm29,%ymm3,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_424
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_424
|
|
.L_small_initial_partial_block_424:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_424:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_424
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_424:
|
|
jmp .L_last_blocks_done_412
|
|
.L_last_num_blocks_is_7_412:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_425
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_425
|
|
|
|
.L_16_blocks_overflow_425:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_425:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_426
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_426
|
|
.L_small_initial_partial_block_426:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_426:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_426
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_426:
|
|
jmp .L_last_blocks_done_412
|
|
.L_last_num_blocks_is_8_412:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_427
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_427
|
|
|
|
.L_16_blocks_overflow_427:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_427:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_428
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_428
|
|
.L_small_initial_partial_block_428:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_428:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_428
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_428:
|
|
jmp .L_last_blocks_done_412
|
|
.L_last_num_blocks_is_9_412:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_429
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_429
|
|
|
|
.L_16_blocks_overflow_429:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_429:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %xmm29,%xmm4,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_430
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_430
|
|
.L_small_initial_partial_block_430:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_430:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_430
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_430:
|
|
jmp .L_last_blocks_done_412
|
|
.L_last_num_blocks_is_10_412:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_431
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_431
|
|
|
|
.L_16_blocks_overflow_431:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_431:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %ymm29,%ymm4,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_432
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_432
|
|
.L_small_initial_partial_block_432:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_432:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_432
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_432:
|
|
jmp .L_last_blocks_done_412
|
|
.L_last_num_blocks_is_11_412:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_433
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_433
|
|
|
|
.L_16_blocks_overflow_433:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_433:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_434
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_434
|
|
.L_small_initial_partial_block_434:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_434:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_434
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_434:
|
|
jmp .L_last_blocks_done_412
|
|
.L_last_num_blocks_is_12_412:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_435
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_435
|
|
|
|
.L_16_blocks_overflow_435:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_435:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_436
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_436
|
|
.L_small_initial_partial_block_436:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_436:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_436
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_436:
|
|
jmp .L_last_blocks_done_412
|
|
.L_last_num_blocks_is_13_412:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_437
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_437
|
|
|
|
.L_16_blocks_overflow_437:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_437:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %xmm29,%xmm5,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_438
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_438
|
|
.L_small_initial_partial_block_438:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_438:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_438
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_438:
|
|
jmp .L_last_blocks_done_412
|
|
.L_last_num_blocks_is_14_412:
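# Same structure for a 14-block tail: three full ZMM counter registers plus
# one YMM (two blocks), with the last load/store masked by k1.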
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_439
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_439
|
|
|
|
.L_16_blocks_overflow_439:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_439:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %ymm29,%ymm5,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_440
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_440
|
|
.L_small_initial_partial_block_440:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_440:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_440
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_440:
|
|
jmp .L_last_blocks_done_412
|
|
.L_last_num_blocks_is_15_412:
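# 15-block tail: like the 16-block case but the final ZMM carries only three
# live blocks; k1 masks the last 64-byte load/store.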
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_441
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_441
|
|
|
|
.L_16_blocks_overflow_441:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_441:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_442
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_442
|
|
.L_small_initial_partial_block_442:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_442:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_442
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_442:
|
|
jmp .L_last_blocks_done_412
|
|
.L_last_num_blocks_is_16_412:
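# 16-block tail: four full ZMM counter registers; k1 still masks the last
# 64-byte chunk because the 16th block may be short.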
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_443
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_443
|
|
|
|
.L_16_blocks_overflow_443:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_443:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_444:
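# Unlike the shorter tails, the 16-block case has no separate "all blocks
# full" branch: it falls directly into the partial-block bookkeeping below.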
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_444:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_444:
|
|
jmp .L_last_blocks_done_412
|
|
.L_last_num_blocks_is_0_412:
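# No blocks left to encrypt: just GHASH the 16 blocks saved on the stack from
# the previous pass (accumulating into zmm24/zmm25/zmm26) and reduce to xmm14.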
vmovdqa64 1280(%rsp),%zmm13
|
|
vmovdqu64 512(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1344(%rsp),%zmm13
|
|
vmovdqu64 576(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1408(%rsp),%zmm13
|
|
vmovdqu64 640(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1472(%rsp),%zmm13
|
|
vmovdqu64 704(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_412:
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_334
|
|
|
|
.L_message_below_32_blocks_334:
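# Fewer than 32 blocks remain after the initial pass: account for the 256
# bytes just processed, then (unless %r14 says it was already done) extend the
# table of hash-key powers kept on the stack before handling the short tail.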
subq $256,%r8
|
|
addq $256,%r11
|
|
movl %r8d,%r10d
|
|
testq %r14,%r14
|
|
jnz .L_skip_hkeys_precomputation_445
|
|
vmovdqu64 640(%rsp),%zmm3
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
|
|
|
|
vmovdqu64 576(%rsp),%zmm4
|
|
vmovdqu64 512(%rsp),%zmm5
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,448(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,384(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,320(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,256(%rsp)
|
|
.L_skip_hkeys_precomputation_445:
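# Mark the stack hash-key table as ready (r14=1), compute rbx (offset of the
# first key power to use on the stack) and r10d (blocks left, rounded up),
# then dispatch through the compare tree to the matching
# .L_last_num_blocks_is_*_446 handler.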
movq $1,%r14
|
|
andl $~15,%r10d
|
|
movl $512,%ebx
|
|
subl %r10d,%ebx
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
|
|
je .L_last_num_blocks_is_0_446
|
|
|
|
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_446
|
|
jb .L_last_num_blocks_is_7_1_446
|
|
|
|
|
|
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_446
|
|
jb .L_last_num_blocks_is_11_9_446
|
|
|
|
|
|
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_446
|
|
ja .L_last_num_blocks_is_16_446
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_446
|
|
jmp .L_last_num_blocks_is_13_446
|
|
|
|
.L_last_num_blocks_is_11_9_446:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_446
|
|
ja .L_last_num_blocks_is_11_446
|
|
jmp .L_last_num_blocks_is_9_446
|
|
|
|
.L_last_num_blocks_is_7_1_446:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_446
|
|
jb .L_last_num_blocks_is_3_1_446
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_446
|
|
je .L_last_num_blocks_is_6_446
|
|
jmp .L_last_num_blocks_is_5_446
|
|
|
|
.L_last_num_blocks_is_3_1_446:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_446
|
|
je .L_last_num_blocks_is_2_446
|
|
.L_last_num_blocks_is_1_446:
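# 1-block tail of the below-32-blocks path. Structure mirrors the handlers
# above, but the GHASH of the 16 stacked blocks is carried forward in
# zmm24/zmm25/zmm26 and only combined during the final reduction.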
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_447
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_447
|
|
|
|
.L_16_blocks_overflow_447:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_447:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %xmm29,%xmm0,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_448
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_448
|
|
.L_small_initial_partial_block_448:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm0
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
|
|
vpslldq $8,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm3
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
|
|
vpsrldq $4,%xmm4,%xmm4
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm4,%xmm14
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_448
|
|
.L_small_initial_compute_done_448:
|
|
.L_after_reduction_448:
|
|
jmp .L_last_blocks_done_446
|
|
.L_last_num_blocks_is_2_446:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_449
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_449
|
|
|
|
.L_16_blocks_overflow_449:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_449:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %ymm29,%ymm0,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_450
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_450
|
|
.L_small_initial_partial_block_450:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_450:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_450
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_450:
|
|
jmp .L_last_blocks_done_446
|
|
.L_last_num_blocks_is_3_446:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_451
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_451
|
|
|
|
.L_16_blocks_overflow_451:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_451:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_452
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_452
|
|
.L_small_initial_partial_block_452:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_452:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_452
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_452:
|
|
jmp .L_last_blocks_done_446
|
|
.L_last_num_blocks_is_4_446:
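# Tail: 4 blocks left (at most 64 data bytes). Same single-%zmm0 layout as the
# 3-block case; the carried-over counter is taken from lane 3 of %zmm0.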
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_453
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_453
|
|
|
|
.L_16_blocks_overflow_453:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_453:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_454
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_454
|
|
.L_small_initial_partial_block_454:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_454:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_454
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_454:
|
|
jmp .L_last_blocks_done_446
|
|
.L_last_num_blocks_is_5_446:
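# Tail: 5 blocks left. Four counters in %zmm0 plus one in %xmm3; %k1 masks the
# bytes past the first 64 (the table is indexed with length minus 64).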
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_455
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_455
|
|
|
|
.L_16_blocks_overflow_455:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_455:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %xmm29,%xmm3,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_456
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_456
|
|
.L_small_initial_partial_block_456:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_456:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_456
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_456:
|
|
jmp .L_last_blocks_done_446
|
|
.L_last_num_blocks_is_6_446:
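# Tail: 6 blocks left. %zmm0 plus two counters in %ymm3; %k1 masks the bytes
# past the first 64.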
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_457
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_457
|
|
|
|
.L_16_blocks_overflow_457:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_457:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %ymm29,%ymm3,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_458
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_458
|
|
.L_small_initial_partial_block_458:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_458:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_458
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_458:
|
|
jmp .L_last_blocks_done_446
|
|
.L_last_num_blocks_is_7_446:
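# Tail: 7 blocks left. %zmm0 plus three counters in %zmm3; %k1 masks the bytes
# past the first 64.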
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_459
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_459
|
|
|
|
.L_16_blocks_overflow_459:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_459:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_460
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_460
|
|
.L_small_initial_partial_block_460:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_460:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_460
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_460:
|
|
jmp .L_last_blocks_done_446
|
|
.L_last_num_blocks_is_8_446:
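# Tail: 8 blocks left (at most 128 data bytes). Two full counter vectors,
# %zmm0 and %zmm3; %k1 masks the bytes past the first 64.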
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_461
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_461
|
|
|
|
.L_16_blocks_overflow_461:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_461:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_462
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_462
|
|
.L_small_initial_partial_block_462:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_462:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_462
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_462:
|
|
jmp .L_last_blocks_done_446
|
|
.L_last_num_blocks_is_9_446:
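# Tail: 9 blocks left. %zmm0 and %zmm3 plus one counter in %xmm4; %k1 masks the
# bytes past the first 128 (the table is indexed with length minus 128).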
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_463
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_463
|
|
|
|
.L_16_blocks_overflow_463:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_463:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %xmm29,%xmm4,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_464
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_464
|
|
.L_small_initial_partial_block_464:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_464:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_464
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_464:
|
|
jmp .L_last_blocks_done_446
|
|
.L_last_num_blocks_is_10_446:
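# Tail: 10 blocks left. %zmm0 and %zmm3 plus two counters in %ymm4; %k1 masks
# the bytes past the first 128.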
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_465
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_465
|
|
|
|
.L_16_blocks_overflow_465:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_465:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %ymm29,%ymm4,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_466
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_466
|
|
.L_small_initial_partial_block_466:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_466:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_466
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_466:
|
|
jmp .L_last_blocks_done_446
|
|
.L_last_num_blocks_is_11_446:
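# Tail: 11 blocks left. %zmm0 and %zmm3 plus three counters in %zmm4; %k1 masks
# the bytes past the first 128.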
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_467
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_467
|
|
|
|
.L_16_blocks_overflow_467:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_467:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_468
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_468
|
|
.L_small_initial_partial_block_468:
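# Last block of this run is partial: record its byte count at (%rdx), save the
# ciphertext block at 16(%rsi), and GHASH only the complete blocks here.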
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_468:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_468
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_468:
|
|
jmp .L_last_blocks_done_446
|
|
.L_last_num_blocks_is_12_446:
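# Tail path for 12 remaining blocks: build the byte mask for the (possibly
# partial) third 64-byte chunk from byte64_len_to_mask_table, generate 12
# counter blocks, and interleave the AES rounds with GHASH multiplies of the
# data buffered on the stack from the previous pass.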
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_469
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_469
.L_16_blocks_overflow_469:
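# Taken when the low byte of the counter would wrap while adding the block
# indices: shuffle to addition order, add with full carries (ddq_add_1234 /
# ddq_add_4444), then shuffle back.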
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_469:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_470
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_470
|
|
.L_small_initial_partial_block_470:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_470:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_470
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_470:
|
|
jmp .L_last_blocks_done_446
|
|
.L_last_num_blocks_is_13_446:
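# 13-block tail: the 13th counter block is carried in %xmm5 and its
# input/output go through the masked load/store at offset 192.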
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_471
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_471
|
|
|
|
.L_16_blocks_overflow_471:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_471:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %xmm29,%xmm5,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_472
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_472
|
|
.L_small_initial_partial_block_472:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_472:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_472
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_472:
|
|
jmp .L_last_blocks_done_446
|
|
.L_last_num_blocks_is_14_446:
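# 14-block tail: blocks 13-14 are handled in the %ymm5 lane alongside three
# full zmm groups.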
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_473
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_473
|
|
|
|
.L_16_blocks_overflow_473:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_473:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %ymm29,%ymm5,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_474
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_474
|
|
.L_small_initial_partial_block_474:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_474:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_474
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_474:
|
|
jmp .L_last_blocks_done_446
|
|
.L_last_num_blocks_is_15_446:
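# 15-block tail: the fourth group is a full zmm, with the masked load/store
# covering the final partial block.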
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_475
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_475
|
|
|
|
.L_16_blocks_overflow_475:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_475:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_476
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_476
|
|
.L_small_initial_partial_block_476:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_476:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_476
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_476:
|
|
jmp .L_last_blocks_done_446
|
|
.L_last_num_blocks_is_16_446:
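# 16-block tail: all four zmm groups are used; the last 64-byte load/store
# stays masked and control falls straight into the partial-block bookkeeping.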
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_477
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_477
|
|
|
|
.L_16_blocks_overflow_477:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_477:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm17
|
|
vpshufb %zmm29,%zmm3,%zmm19
|
|
vpshufb %zmm29,%zmm4,%zmm20
|
|
vpshufb %zmm29,%zmm5,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_478:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_478:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_478:
|
|
jmp .L_last_blocks_done_446
|
|
.L_last_num_blocks_is_0_446:
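# Nothing left to encrypt: XOR the running hash into the first buffered
# chunk, multiply the four buffered chunks by their hash-key powers,
# accumulate, and reduce with POLY2 into %xmm14.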
|
|
vmovdqa64 768(%rsp),%zmm13
|
|
vpxorq %zmm14,%zmm13,%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 832(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpxorq %zmm10,%zmm4,%zmm26
|
|
vpxorq %zmm6,%zmm0,%zmm24
|
|
vpxorq %zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 896(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 960(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_446:
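# Return the counter block to its stored byte order and take the common exit.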
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_334
|
|
|
|
.L_message_below_equal_16_blocks_334:
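# Whole message is at most 16 blocks: round the byte count up to a block
# count (ceil(len/16)) in %r12d and branch to the matching handler.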
movl %r8d,%r12d
|
|
addl $15,%r12d
|
|
shrl $4,%r12d
|
|
cmpq $8,%r12
|
|
je .L_small_initial_num_blocks_is_8_479
|
|
jl .L_small_initial_num_blocks_is_7_1_479
|
|
|
|
|
|
cmpq $12,%r12
|
|
je .L_small_initial_num_blocks_is_12_479
|
|
jl .L_small_initial_num_blocks_is_11_9_479
|
|
|
|
|
|
cmpq $16,%r12
|
|
je .L_small_initial_num_blocks_is_16_479
|
|
cmpq $15,%r12
|
|
je .L_small_initial_num_blocks_is_15_479
|
|
cmpq $14,%r12
|
|
je .L_small_initial_num_blocks_is_14_479
|
|
jmp .L_small_initial_num_blocks_is_13_479
|
|
|
|
.L_small_initial_num_blocks_is_11_9_479:
|
|
|
|
cmpq $11,%r12
|
|
je .L_small_initial_num_blocks_is_11_479
|
|
cmpq $10,%r12
|
|
je .L_small_initial_num_blocks_is_10_479
|
|
jmp .L_small_initial_num_blocks_is_9_479
|
|
|
|
.L_small_initial_num_blocks_is_7_1_479:
|
|
cmpq $4,%r12
|
|
je .L_small_initial_num_blocks_is_4_479
|
|
jl .L_small_initial_num_blocks_is_3_1_479
|
|
|
|
cmpq $7,%r12
|
|
je .L_small_initial_num_blocks_is_7_479
|
|
cmpq $6,%r12
|
|
je .L_small_initial_num_blocks_is_6_479
|
|
jmp .L_small_initial_num_blocks_is_5_479
|
|
|
|
.L_small_initial_num_blocks_is_3_1_479:
|
|
|
|
cmpq $3,%r12
|
|
je .L_small_initial_num_blocks_is_3_479
|
|
cmpq $2,%r12
|
|
je .L_small_initial_num_blocks_is_2_479
.L_small_initial_num_blocks_is_1_479:
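# One-block message: encrypt a single counter block with masked load/store;
# a complete block is GHASHed with the key power at 336(%rsi), a partial one
# has its length stored at (%rdx) and is XORed into the running hash.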
|
|
vmovdqa64 SHUF_MASK(%rip),%xmm29
|
|
vpaddd ONE(%rip),%xmm2,%xmm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %xmm15,%xmm0,%xmm0
|
|
vpxorq %xmm6,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %xmm29,%xmm0,%xmm6
|
|
vextracti32x4 $0,%zmm6,%xmm13
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_480
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_480
|
|
.L_small_initial_partial_block_480:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
vpxorq %xmm13,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_480
|
|
.L_small_initial_compute_done_480:
|
|
.L_after_reduction_480:
|
|
jmp .L_small_initial_blocks_encrypted_479
|
|
.L_small_initial_num_blocks_is_2_479:
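# Two-block message: same flow with a %ymm pair of counter blocks and the
# two hash-key powers at 320(%rsi).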
|
|
vmovdqa64 SHUF_MASK(%rip),%ymm29
|
|
vshufi64x2 $0,%ymm2,%ymm2,%ymm0
|
|
vpaddd ddq_add_1234(%rip),%ymm0,%ymm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %ymm15,%ymm0,%ymm0
|
|
vpxorq %ymm6,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %ymm29,%ymm0,%ymm6
|
|
vextracti32x4 $1,%zmm6,%xmm13
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_481
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_481
|
|
.L_small_initial_partial_block_481:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_481:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_481
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_481:
|
|
jmp .L_small_initial_blocks_encrypted_479
|
|
.L_small_initial_num_blocks_is_3_479:
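# Three-block message: the three key powers are assembled by inserting
# 336(%rsi) above the %ymm pair at 304(%rsi).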
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vextracti32x4 $2,%zmm6,%xmm13
|
|
subq $16 * (3 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_482
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_482
|
|
.L_small_initial_partial_block_482:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_482:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_482
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_482:
|
|
jmp .L_small_initial_blocks_encrypted_479
|
|
.L_small_initial_num_blocks_is_4_479:
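/* Four blocks: a single masked ZMM of counter blocks (ddq_add_1234); the
   full-block GHASH path multiplies the four reflected ciphertext blocks
   by the key powers at 288(%rsi). */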
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vextracti32x4 $3,%zmm6,%xmm13
|
|
subq $16 * (4 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_483
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_483
|
|
.L_small_initial_partial_block_483:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_483:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_483
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_483:
|
|
jmp .L_small_initial_blocks_encrypted_479
|
|
.L_small_initial_num_blocks_is_5_479:
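/* Five blocks: four counters in zmm0 plus one in xmm3 (ddq_add_5678),
   the last block masked by k1; the full-block GHASH path uses the key
   powers at 272(%rsi) and 336(%rsi). */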
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %xmm15,%xmm3,%xmm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %xmm7,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %xmm29,%xmm3,%xmm7
|
|
vextracti32x4 $0,%zmm7,%xmm13
|
|
subq $16 * (5 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_484
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_484
|
|
.L_small_initial_partial_block_484:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_484:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_484
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_484:
|
|
jmp .L_small_initial_blocks_encrypted_479
|
|
.L_small_initial_num_blocks_is_6_479:
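/* Six blocks: zmm0 plus a masked ymm3 pair; the full-block GHASH path
   uses the key powers at 256(%rsi) and 320(%rsi). */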
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %ymm15,%ymm3,%ymm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %ymm7,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %ymm29,%ymm3,%ymm7
|
|
vextracti32x4 $1,%zmm7,%xmm13
|
|
subq $16 * (6 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_485
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_485
|
|
.L_small_initial_partial_block_485:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_485:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_485
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_485:
|
|
jmp .L_small_initial_blocks_encrypted_479
|
|
.L_small_initial_num_blocks_is_7_479:
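/* Seven blocks: zmm0 plus a masked zmm3; the full-block GHASH path uses
   the key powers at 240(%rsi) and 304/336(%rsi). */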
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vextracti32x4 $2,%zmm7,%xmm13
|
|
subq $16 * (7 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_486
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_486
|
|
.L_small_initial_partial_block_486:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_486:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_486
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_486:
|
|
jmp .L_small_initial_blocks_encrypted_479
|
|
.L_small_initial_num_blocks_is_8_479:
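/* Eight blocks: two full ZMM counter vectors (zmm0, zmm3), the second
   load/store masked; the full-block GHASH path uses the key powers at
   224(%rsi) and 288(%rsi). */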
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vextracti32x4 $3,%zmm7,%xmm13
|
|
subq $16 * (8 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_487
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_487
|
|
.L_small_initial_partial_block_487:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_487:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_487
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_487:
|
|
jmp .L_small_initial_blocks_encrypted_479
|
|
.L_small_initial_num_blocks_is_9_479:
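/* Nine blocks: zmm0, zmm3 and one block in xmm4 (ddq_add_8888); the
   full-block GHASH path uses the key powers at 208(%rsi), 272(%rsi) and
   336(%rsi). */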
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %xmm15,%xmm4,%xmm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %xmm10,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %xmm29,%xmm4,%xmm10
|
|
vextracti32x4 $0,%zmm10,%xmm13
|
|
subq $16 * (9 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_488
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_488
|
|
.L_small_initial_partial_block_488:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_488:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_488
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_488:
|
|
jmp .L_small_initial_blocks_encrypted_479
|
|
.L_small_initial_num_blocks_is_10_479:
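/* Ten blocks: zmm0, zmm3 and a masked ymm4; the full-block GHASH path
   uses the key powers at 192(%rsi), 256(%rsi) and 320(%rsi). */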
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %ymm15,%ymm4,%ymm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %ymm10,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %ymm29,%ymm4,%ymm10
|
|
vextracti32x4 $1,%zmm10,%xmm13
|
|
subq $16 * (10 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_489
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_489
|
|
.L_small_initial_partial_block_489:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_489:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_489
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_489:
|
|
jmp .L_small_initial_blocks_encrypted_479
|
|
.L_small_initial_num_blocks_is_11_479:
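/* Eleven blocks: zmm0, zmm3 and a masked zmm4; the full-block GHASH path
   uses the key powers at 176(%rsi), 240(%rsi) and 304/336(%rsi). */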
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %zmm29,%zmm4,%zmm10
|
|
vextracti32x4 $2,%zmm10,%xmm13
|
|
subq $16 * (11 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_490
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_490
|
|
.L_small_initial_partial_block_490:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_490:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_490
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_490:
|
|
jmp .L_small_initial_blocks_encrypted_479
|
|
.L_small_initial_num_blocks_is_12_479:
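/* Twelve blocks: three full ZMM counter vectors, the third load/store
   masked; the full-block GHASH path uses the key powers at 160(%rsi),
   224(%rsi) and 288(%rsi). */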
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %zmm29,%zmm4,%zmm10
|
|
vextracti32x4 $3,%zmm10,%xmm13
|
|
subq $16 * (12 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_491
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 160(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_491
|
|
.L_small_initial_partial_block_491:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_491:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_491
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_491:
|
|
jmp .L_small_initial_blocks_encrypted_479
|
|
.L_small_initial_num_blocks_is_13_479:
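/* Thirteen blocks: zmm0, zmm3, zmm4 and one block in xmm5 (masked); the
   full-block GHASH path uses the key powers at 144(%rsi), 208(%rsi),
   272(%rsi) and 336(%rsi). */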
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %xmm15,%xmm5,%xmm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %xmm11,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %zmm29,%zmm4,%zmm10
|
|
vpshufb %xmm29,%xmm5,%xmm11
|
|
vextracti32x4 $0,%zmm11,%xmm13
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_492
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 144(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_492
.L_small_initial_partial_block_492:
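# Editor's note: the 13th block is partial.  Record the leftover byte count at
# (%rdx) and stash the raw ciphertext block at 16(%rsi), then fold only the 12
# full blocks into GHASH (hash-key powers loaded from 160/224/288(%rsi)) and
# reduce with POLY2.  The partial block itself is merely XORed into the
# accumulator at .L_small_initial_compute_done_492 and left for the
# partial-block logic to complete later.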
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 160(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_492:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_492
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_492:
|
|
jmp .L_small_initial_blocks_encrypted_479
.L_small_initial_num_blocks_is_14_479:
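# Editor's note: same pattern as the 13-block case but with a two-block (%ymm5)
# tail: 14 counters, masked load/store of the last chunk via %k1, the full
# 0..224(%rdi) round-key sequence, then the byte-swapped ciphertext feeds GHASH.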
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %ymm15,%ymm5,%ymm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %ymm11,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %zmm29,%zmm4,%zmm10
|
|
vpshufb %ymm29,%ymm5,%ymm11
|
|
vextracti32x4 $1,%zmm11,%xmm13
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_493
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 128(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_493
|
|
.L_small_initial_partial_block_493:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 144(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_493:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_493
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_493:
|
|
jmp .L_small_initial_blocks_encrypted_479
.L_small_initial_num_blocks_is_15_479:
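# Editor's note: 15-block variant; the tail now occupies three lanes of %zmm5,
# so the counter shuffle, AES rounds and GHASH inputs all use full zmm widths,
# with only the 192-byte-offset load/store masked by %k1.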
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %zmm15,%zmm5,%zmm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %zmm11,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %zmm29,%zmm4,%zmm10
|
|
vpshufb %zmm29,%zmm5,%zmm11
|
|
vextracti32x4 $2,%zmm11,%xmm13
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_494
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 112(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_494
|
|
.L_small_initial_partial_block_494:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 128(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_494:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_494
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_494:
|
|
jmp .L_small_initial_blocks_encrypted_479
.L_small_initial_num_blocks_is_16_479:
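# Editor's note: 16-block variant (the largest "small initial" case).  All four
# zmm registers carry counters; the last 64-byte load/store is still masked by
# %k1 because the message need not end on a block boundary.  Unlike the smaller
# cases there is no cmpq/jl split below: control always falls through into the
# partial-block bookkeeping at .L_small_initial_partial_block_495.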
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %zmm15,%zmm5,%zmm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %zmm11,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm0,%zmm6
|
|
vpshufb %zmm29,%zmm3,%zmm7
|
|
vpshufb %zmm29,%zmm4,%zmm10
|
|
vpshufb %zmm29,%zmm5,%zmm11
|
|
vextracti32x4 $3,%zmm11,%xmm13
|
|
subq $16 * (16 - 1),%r8
.L_small_initial_partial_block_495:
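# Editor's note: reached by fall-through from the 16-block case.  Store the
# remaining length at (%rdx) and the raw last ciphertext block at 16(%rsi),
# fold the first 15 blocks into GHASH (key powers from 112/176/240(%rsi) plus
# the 304/336(%rsi) pair assembled with vinserti64x2), reduce with POLY2, and
# XOR the final block into the accumulator at .L_small_initial_compute_done_495.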
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 112(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_495:
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_495:
|
|
.L_small_initial_blocks_encrypted_479:
|
|
.L_ghash_done_334:
|
|
vmovdqu64 %xmm2,0(%rsi)
|
|
vmovdqu64 %xmm14,64(%rsi)
|
|
.L_enc_dec_done_334:
|
|
jmp .Lexit_gcm_encrypt
.Lexit_gcm_encrypt:
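# Editor's note: common encrypt exit.  The cmpq $256 guard appears to skip the
# wipe when the on-stack hash-key cache was never populated; otherwise the
# twelve 64-byte slots at 0..704(%rsp) are cleared.  Then vzeroupper, restore
# the six callee-saved registers pushed in the prologue, and return
# (.byte 0xf3,0xc3 is an encoded "rep ret").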
cmpq $256,%r8
|
|
jbe .Lskip_hkeys_cleanup_496
|
|
vpxor %xmm0,%xmm0,%xmm0
|
|
vmovdqa64 %zmm0,0(%rsp)
|
|
vmovdqa64 %zmm0,64(%rsp)
|
|
vmovdqa64 %zmm0,128(%rsp)
|
|
vmovdqa64 %zmm0,192(%rsp)
|
|
vmovdqa64 %zmm0,256(%rsp)
|
|
vmovdqa64 %zmm0,320(%rsp)
|
|
vmovdqa64 %zmm0,384(%rsp)
|
|
vmovdqa64 %zmm0,448(%rsp)
|
|
vmovdqa64 %zmm0,512(%rsp)
|
|
vmovdqa64 %zmm0,576(%rsp)
|
|
vmovdqa64 %zmm0,640(%rsp)
|
|
vmovdqa64 %zmm0,704(%rsp)
|
|
.Lskip_hkeys_cleanup_496:
|
|
vzeroupper
|
|
leaq (%rbp),%rsp
|
|
.cfi_def_cfa_register %rsp
|
|
popq %r15
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_restore %r15
|
|
popq %r14
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_restore %r14
|
|
popq %r13
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_restore %r13
|
|
popq %r12
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_restore %r12
|
|
popq %rbp
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_restore %rbp
|
|
popq %rbx
|
|
.cfi_adjust_cfa_offset -8
|
|
.cfi_restore %rbx
|
|
.byte 0xf3,0xc3
|
|
.Lencrypt_seh_end:
|
|
.cfi_endproc
|
|
.size ossl_aes_gcm_encrypt_avx512, .-ossl_aes_gcm_encrypt_avx512
|
|
.globl ossl_aes_gcm_decrypt_avx512
|
|
.type ossl_aes_gcm_decrypt_avx512,@function
|
|
.align 32
ossl_aes_gcm_decrypt_avx512:
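# Editor's note: decrypt entry point.  The prologue mirrors the encrypt routine:
# endbr64 (.byte 243,15,30,250), push %rbx/%rbp/%r12-%r15 with matching CFI and
# SEH markers, use %rbp as the frame pointer, reserve a 64-byte-aligned scratch
# area, then dispatch on the round-count field at 240(%rdi) (9/11/13 selecting
# the AES-128/192/256 bodies).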
.cfi_startproc
|
|
.Ldecrypt_seh_begin:
|
|
.byte 243,15,30,250
|
|
pushq %rbx
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %rbx,-16
|
|
.Ldecrypt_seh_push_rbx:
|
|
pushq %rbp
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %rbp,-24
|
|
.Ldecrypt_seh_push_rbp:
|
|
pushq %r12
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %r12,-32
|
|
.Ldecrypt_seh_push_r12:
|
|
pushq %r13
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %r13,-40
|
|
.Ldecrypt_seh_push_r13:
|
|
pushq %r14
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %r14,-48
|
|
.Ldecrypt_seh_push_r14:
|
|
pushq %r15
|
|
.cfi_adjust_cfa_offset 8
|
|
.cfi_offset %r15,-56
|
|
.Ldecrypt_seh_push_r15:
leaq 0(%rsp),%rbp
|
|
.cfi_def_cfa_register %rbp
|
|
.Ldecrypt_seh_setfp:
.Ldecrypt_seh_prolog_end:
|
|
subq $1588,%rsp
|
|
andq $(-64),%rsp
movl 240(%rdi),%eax
|
|
cmpl $9,%eax
|
|
je .Laes_gcm_decrypt_128_avx512
|
|
cmpl $11,%eax
|
|
je .Laes_gcm_decrypt_192_avx512
|
|
cmpl $13,%eax
|
|
je .Laes_gcm_decrypt_256_avx512
|
|
xorl %eax,%eax
|
|
jmp .Lexit_gcm_decrypt
|
|
.align 32
.Laes_gcm_decrypt_128_avx512:
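# Editor's note: AES-128 decrypt body.  Before the bulk loop it completes any
# partial block carried in the context: (%rdx) holds the byte count already
# consumed, 16(%rsi) the buffered partial-block state, 64(%rsi) the current
# GHASH value; the SHIFT_MASK / byte_len_to_mask_table logic below realigns and
# masks that block, updates GHASH, and writes out the recovered plaintext bytes.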
orq %r8,%r8
|
|
je .L_enc_dec_done_497
|
|
xorq %r14,%r14
|
|
vmovdqu64 64(%rsi),%xmm14
|
|
|
|
movq (%rdx),%r11
|
|
orq %r11,%r11
|
|
je .L_partial_block_done_498
|
|
movl $16,%r10d
|
|
leaq byte_len_to_mask_table(%rip),%r12
|
|
cmpq %r10,%r8
|
|
cmovcq %r8,%r10
|
|
kmovw (%r12,%r10,2),%k1
|
|
vmovdqu8 (%rcx),%xmm0{%k1}{z}
|
|
|
|
vmovdqu64 16(%rsi),%xmm3
|
|
vmovdqu64 336(%rsi),%xmm4
|
|
|
|
|
|
|
|
leaq SHIFT_MASK(%rip),%r12
|
|
addq %r11,%r12
|
|
vmovdqu64 (%r12),%xmm5
|
|
vpshufb %xmm5,%xmm3,%xmm3
|
|
|
|
vmovdqa64 %xmm0,%xmm6
|
|
vpxorq %xmm0,%xmm3,%xmm3
|
|
|
|
|
|
leaq (%r8,%r11,1),%r13
|
|
subq $16,%r13
|
|
jge .L_no_extra_mask_498
|
|
subq %r13,%r12
|
|
.L_no_extra_mask_498:
|
|
|
|
|
|
|
|
vmovdqu64 16(%r12),%xmm0
|
|
vpand %xmm0,%xmm3,%xmm3
|
|
vpand %xmm0,%xmm6,%xmm6
|
|
vpshufb SHUF_MASK(%rip),%xmm6,%xmm6
|
|
vpshufb %xmm5,%xmm6,%xmm6
|
|
vpxorq %xmm6,%xmm14,%xmm14
|
|
cmpq $0,%r13
|
|
jl .L_partial_incomplete_498
|
|
|
|
vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7
|
|
vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10
|
|
vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11
|
|
vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14
|
|
vpxorq %xmm11,%xmm14,%xmm14
|
|
|
|
vpsrldq $8,%xmm14,%xmm11
|
|
vpslldq $8,%xmm14,%xmm14
|
|
vpxorq %xmm11,%xmm7,%xmm7
|
|
vpxorq %xmm10,%xmm14,%xmm14
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%xmm11
|
|
|
|
vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10
|
|
vpslldq $8,%xmm10,%xmm10
|
|
vpxorq %xmm10,%xmm14,%xmm14
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10
|
|
vpsrldq $4,%xmm10,%xmm10
|
|
vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
|
|
vpternlogq $0x96,%xmm10,%xmm7,%xmm14
|
|
|
|
movq $0,(%rdx)
|
|
|
|
movq %r11,%r12
|
|
movq $16,%r11
|
|
subq %r12,%r11
|
|
jmp .L_enc_dec_done_498
|
|
|
|
.L_partial_incomplete_498:
|
|
addq %r8,(%rdx)
|
|
movq %r8,%r11
|
|
|
|
.L_enc_dec_done_498:
|
|
|
|
|
|
leaq byte_len_to_mask_table(%rip),%r12
|
|
kmovw (%r12,%r11,2),%k1
|
|
vmovdqu64 %xmm14,64(%rsi)
|
|
movq %r9,%r12
|
|
vmovdqu8 %xmm3,(%r12){%k1}
|
|
.L_partial_block_done_498:
|
|
vmovdqu64 0(%rsi),%xmm2
|
|
subq %r11,%r8
|
|
je .L_enc_dec_done_497
|
|
cmpq $256,%r8
|
|
jbe .L_message_below_equal_16_blocks_497
|
|
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vmovdqa64 ddq_addbe_4444(%rip),%zmm27
|
|
vmovdqa64 ddq_addbe_1234(%rip),%zmm28
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vmovd %xmm2,%r15d
|
|
andl $255,%r15d
|
|
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
|
|
|
|
|
|
cmpb $240,%r15b
|
|
jae .L_next_16_overflow_499
|
|
vpaddd %zmm28,%zmm2,%zmm7
|
|
vpaddd %zmm27,%zmm7,%zmm10
|
|
vpaddd %zmm27,%zmm10,%zmm11
|
|
vpaddd %zmm27,%zmm11,%zmm12
|
|
jmp .L_next_16_ok_499
|
|
.L_next_16_overflow_499:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm12
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
|
|
vpaddd %zmm12,%zmm7,%zmm10
|
|
vpaddd %zmm12,%zmm10,%zmm11
|
|
vpaddd %zmm12,%zmm11,%zmm12
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vpshufb %zmm29,%zmm12,%zmm12
|
|
.L_next_16_ok_499:
|
|
vshufi64x2 $255,%zmm12,%zmm12,%zmm2
|
|
addb $16,%r15b
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm0
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm3
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm4
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm5
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm6
|
|
vpxorq %zmm6,%zmm7,%zmm7
|
|
vpxorq %zmm6,%zmm10,%zmm10
|
|
vpxorq %zmm6,%zmm11,%zmm11
|
|
vpxorq %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 16(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 32(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 48(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 64(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 80(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 96(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 112(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 128(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 144(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 160(%rdi),%zmm6
|
|
vaesenclast %zmm6,%zmm7,%zmm7
|
|
vaesenclast %zmm6,%zmm10,%zmm10
|
|
vaesenclast %zmm6,%zmm11,%zmm11
|
|
vaesenclast %zmm6,%zmm12,%zmm12
|
|
|
|
|
|
vpxorq %zmm0,%zmm7,%zmm7
|
|
vpxorq %zmm3,%zmm10,%zmm10
|
|
vpxorq %zmm4,%zmm11,%zmm11
|
|
vpxorq %zmm5,%zmm12,%zmm12
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm7,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm10,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm11,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm12,192(%r10,%r11,1)
|
|
|
|
vpshufb %zmm29,%zmm0,%zmm7
|
|
vpshufb %zmm29,%zmm3,%zmm10
|
|
vpshufb %zmm29,%zmm4,%zmm11
|
|
vpshufb %zmm29,%zmm5,%zmm12
|
|
vmovdqa64 %zmm7,768(%rsp)
|
|
vmovdqa64 %zmm10,832(%rsp)
|
|
vmovdqa64 %zmm11,896(%rsp)
|
|
vmovdqa64 %zmm12,960(%rsp)
|
|
testq %r14,%r14
|
|
jnz .L_skip_hkeys_precomputation_500
|
|
|
|
vmovdqu64 288(%rsi),%zmm0
|
|
vmovdqu64 %zmm0,704(%rsp)
|
|
|
|
vmovdqu64 224(%rsi),%zmm3
|
|
vmovdqu64 %zmm3,640(%rsp)
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
|
|
|
|
vmovdqu64 160(%rsi),%zmm4
|
|
vmovdqu64 %zmm4,576(%rsp)
|
|
|
|
vmovdqu64 96(%rsi),%zmm5
|
|
vmovdqu64 %zmm5,512(%rsp)
|
|
.L_skip_hkeys_precomputation_500:
|
|
cmpq $512,%r8
|
|
jb .L_message_below_32_blocks_497
|
|
|
|
|
|
|
|
cmpb $240,%r15b
|
|
jae .L_next_16_overflow_501
|
|
vpaddd %zmm28,%zmm2,%zmm7
|
|
vpaddd %zmm27,%zmm7,%zmm10
|
|
vpaddd %zmm27,%zmm10,%zmm11
|
|
vpaddd %zmm27,%zmm11,%zmm12
|
|
jmp .L_next_16_ok_501
|
|
.L_next_16_overflow_501:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm12
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
|
|
vpaddd %zmm12,%zmm7,%zmm10
|
|
vpaddd %zmm12,%zmm10,%zmm11
|
|
vpaddd %zmm12,%zmm11,%zmm12
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vpshufb %zmm29,%zmm12,%zmm12
|
|
.L_next_16_ok_501:
|
|
vshufi64x2 $255,%zmm12,%zmm12,%zmm2
|
|
addb $16,%r15b
|
|
|
|
vmovdqu8 256(%rcx,%r11,1),%zmm0
|
|
vmovdqu8 320(%rcx,%r11,1),%zmm3
|
|
vmovdqu8 384(%rcx,%r11,1),%zmm4
|
|
vmovdqu8 448(%rcx,%r11,1),%zmm5
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm6
|
|
vpxorq %zmm6,%zmm7,%zmm7
|
|
vpxorq %zmm6,%zmm10,%zmm10
|
|
vpxorq %zmm6,%zmm11,%zmm11
|
|
vpxorq %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 16(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 32(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 48(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 64(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 80(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 96(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 112(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 128(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 144(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 160(%rdi),%zmm6
|
|
vaesenclast %zmm6,%zmm7,%zmm7
|
|
vaesenclast %zmm6,%zmm10,%zmm10
|
|
vaesenclast %zmm6,%zmm11,%zmm11
|
|
vaesenclast %zmm6,%zmm12,%zmm12
|
|
|
|
|
|
vpxorq %zmm0,%zmm7,%zmm7
|
|
vpxorq %zmm3,%zmm10,%zmm10
|
|
vpxorq %zmm4,%zmm11,%zmm11
|
|
vpxorq %zmm5,%zmm12,%zmm12
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm7,256(%r10,%r11,1)
|
|
vmovdqu8 %zmm10,320(%r10,%r11,1)
|
|
vmovdqu8 %zmm11,384(%r10,%r11,1)
|
|
vmovdqu8 %zmm12,448(%r10,%r11,1)
|
|
|
|
vpshufb %zmm29,%zmm0,%zmm7
|
|
vpshufb %zmm29,%zmm3,%zmm10
|
|
vpshufb %zmm29,%zmm4,%zmm11
|
|
vpshufb %zmm29,%zmm5,%zmm12
|
|
vmovdqa64 %zmm7,1024(%rsp)
|
|
vmovdqa64 %zmm10,1088(%rsp)
|
|
vmovdqa64 %zmm11,1152(%rsp)
|
|
vmovdqa64 %zmm12,1216(%rsp)
|
|
testq %r14,%r14
jnz .L_skip_hkeys_precomputation_502
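# Editor's note: one-time extension of the hash-key table, guarded by the %r14
# "already precomputed" flag (set to 1 further down).  Each step below is a full
# GF(2^128) multiply: four vpclmulqdq partial products, a fold of the middle
# terms, then the two-stage POLY2 reduction; results are stored at descending
# stack slots 448,384,...,0(%rsp) so the 48-block main loop can read all of its
# key powers from the stack.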
vmovdqu64 640(%rsp),%zmm3
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
|
|
|
|
vmovdqu64 576(%rsp),%zmm4
|
|
vmovdqu64 512(%rsp),%zmm5
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,448(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,384(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,320(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,256(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,192(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,128(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,64(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,0(%rsp)
|
|
.L_skip_hkeys_precomputation_502:
|
|
movq $1,%r14
|
|
addq $512,%r11
|
|
subq $512,%r8
|
|
|
|
cmpq $768,%r8
|
|
jb .L_no_more_big_nblocks_497
.L_encrypt_big_nblocks_497:
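# Editor's note: main bulk loop of the decrypt path, 48 blocks (768 bytes) per
# iteration in three 16-block passes.  Each pass derives 16 counter blocks (the
# cmpb $240 test takes the byte-swapped ddq_add path when the low counter byte
# would wrap), runs the AES-128 rounds on them while interleaving vpclmulqdq
# GHASH of the 16 previously buffered ciphertext blocks, XORs the keystream with
# the input, and parks the byte-swapped input on the stack for the next pass.
# The label says "encrypt" because only counter blocks are ever encrypted.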
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_503
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_503
|
|
.L_16_blocks_overflow_503:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_503:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vpxorq %zmm12,%zmm6,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm17,%zmm0
|
|
vpshufb %zmm29,%zmm19,%zmm3
|
|
vpshufb %zmm29,%zmm20,%zmm4
|
|
vpshufb %zmm29,%zmm21,%zmm5
|
|
vmovdqa64 %zmm0,1280(%rsp)
|
|
vmovdqa64 %zmm3,1344(%rsp)
|
|
vmovdqa64 %zmm4,1408(%rsp)
|
|
vmovdqa64 %zmm5,1472(%rsp)
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_504
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_504
|
|
.L_16_blocks_overflow_504:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_504:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 256(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 320(%rsp),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 384(%rsp),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 448(%rsp),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 256(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 320(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 384(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 448(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vpternlogq $0x96,%zmm12,%zmm6,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,256(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,320(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,384(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,448(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm17,%zmm0
|
|
vpshufb %zmm29,%zmm19,%zmm3
|
|
vpshufb %zmm29,%zmm20,%zmm4
|
|
vpshufb %zmm29,%zmm21,%zmm5
|
|
vmovdqa64 %zmm0,768(%rsp)
|
|
vmovdqa64 %zmm3,832(%rsp)
|
|
vmovdqa64 %zmm4,896(%rsp)
|
|
vmovdqa64 %zmm5,960(%rsp)
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_505
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_505
|
|
.L_16_blocks_overflow_505:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_505:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 512(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 576(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 640(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 704(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
|
|
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpternlogq $0x96,%zmm15,%zmm12,%zmm6
|
|
vpxorq %zmm24,%zmm6,%zmm6
|
|
vpternlogq $0x96,%zmm10,%zmm13,%zmm7
|
|
vpxorq %zmm25,%zmm7,%zmm7
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vextracti64x4 $1,%zmm6,%ymm12
|
|
vpxorq %ymm12,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm12
|
|
vpxorq %xmm12,%xmm6,%xmm6
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
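# Fold the four 128-bit lanes of the high (%zmm6) and low (%zmm7) accumulators
# down to single 128-bit values ahead of the final reduction.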
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm6
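# End of the reduction: %xmm6 now holds the GHASH value accumulated so far; it
# is copied into %zmm14 once the stores below are done.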
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
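# XOR the AES keystream with the loaded input blocks to form this pass's output.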
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,512(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,576(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,640(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,704(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm17,%zmm0
|
|
vpshufb %zmm29,%zmm19,%zmm3
|
|
vpshufb %zmm29,%zmm20,%zmm4
|
|
vpshufb %zmm29,%zmm21,%zmm5
|
|
vmovdqa64 %zmm0,1024(%rsp)
|
|
vmovdqa64 %zmm3,1088(%rsp)
|
|
vmovdqa64 %zmm4,1152(%rsp)
|
|
vmovdqa64 %zmm5,1216(%rsp)
|
|
vmovdqa64 %zmm6,%zmm14
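# The byte-reflected input blocks were stashed at 1024..1216(%rsp) so a later
# pass can fold them into GHASH, and %zmm14 carries the reduced hash forward.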
|
|
|
|
addq $768,%r11
|
|
subq $768,%r8
|
|
cmpq $768,%r8
|
|
jae .L_encrypt_big_nblocks_497
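# Advance the data offset by 768 bytes and repeat while at least 768 bytes
# remain; otherwise fall through and drain the blocks still pending in GHASH.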
|
|
|
|
.L_no_more_big_nblocks_497:
|
|
|
|
cmpq $512,%r8
|
|
jae .L_encrypt_32_blocks_497
|
|
|
|
cmpq $256,%r8
|
|
jae .L_encrypt_16_blocks_497
|
|
.L_encrypt_0_blocks_ghash_32_497:
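# 32 data blocks from the last loop iteration are still pending in GHASH. The 16
# blocks saved at 768..960(%rsp) are folded here; %rbx appears to select hash-key
# powers so each block is multiplied by the right power of H, and the remaining
# 16 blocks are folded inside the last-blocks handler chosen below.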
|
|
movl %r8d,%r10d
|
|
andl $~15,%r10d
|
|
movl $256,%ebx
|
|
subl %r10d,%ebx
|
|
vmovdqa64 768(%rsp),%zmm13
|
|
vpxorq %zmm14,%zmm13,%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 832(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpxorq %zmm10,%zmm4,%zmm26
|
|
vpxorq %zmm6,%zmm0,%zmm24
|
|
vpxorq %zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 896(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 960(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
addl $256,%ebx
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
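# %r10d = ceil(remaining bytes / 16), i.e. the number of 16-byte blocks left
# (counting a final partial block); the compare tree below dispatches to the
# matching .L_last_num_blocks_is_N handler.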
|
|
je .L_last_num_blocks_is_0_506
|
|
|
|
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_506
|
|
jb .L_last_num_blocks_is_7_1_506
|
|
|
|
|
|
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_506
|
|
jb .L_last_num_blocks_is_11_9_506
|
|
|
|
|
|
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_506
|
|
ja .L_last_num_blocks_is_16_506
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_506
|
|
jmp .L_last_num_blocks_is_13_506
|
|
|
|
.L_last_num_blocks_is_11_9_506:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_506
|
|
ja .L_last_num_blocks_is_11_506
|
|
jmp .L_last_num_blocks_is_9_506
|
|
|
|
.L_last_num_blocks_is_7_1_506:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_506
|
|
jb .L_last_num_blocks_is_3_1_506
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_506
|
|
je .L_last_num_blocks_is_6_506
|
|
jmp .L_last_num_blocks_is_5_506
|
|
|
|
.L_last_num_blocks_is_3_1_506:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_506
|
|
je .L_last_num_blocks_is_2_506
|
|
.L_last_num_blocks_is_1_506:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
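# byte64_len_to_mask_table presumably maps a byte count n (0..64) to a 64-bit
# mask with n low bits set; %k1 then drives the masked load and store of the
# final, possibly partial, block.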
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_507
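# Fast path: the counter in %zmm2 is kept in its byte-swapped, AES-ready form and
# incremented directly (%zmm28/%zmm27 appear to hold increments encoded for that
# byte order), which is only safe while the low counter byte does not wrap. The
# overflow path shuffles to little-endian, adds ddq_add_1234, and shuffles back.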
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_507
|
|
|
|
.L_16_blocks_overflow_507:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_507:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %xmm29,%xmm17,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_508
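# If the tail ends exactly on a block boundary, the partial-block length at
# (%rdx) is cleared and the block is folded into GHASH now. Otherwise the byte
# count is recorded, what looks like the partially consumed block state is saved
# at 16(%rsi), and the partial block is only XORed into the hash so a later call
# can complete it.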
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_508
|
|
.L_small_initial_partial_block_508:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm0
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
|
|
vpslldq $8,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm3
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
|
|
vpsrldq $4,%xmm4,%xmm4
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm4,%xmm14
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_508
|
|
.L_small_initial_compute_done_508:
|
|
.L_after_reduction_508:
|
|
jmp .L_last_blocks_done_506
|
|
.L_last_num_blocks_is_2_506:
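# The handlers for 2..16 remaining blocks below repeat the 1-block pattern,
# differing only in how many counter blocks are generated, which hash-key powers
# are loaded from the context, and the width of the masked load/store.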
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_509
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_509
|
|
|
|
.L_16_blocks_overflow_509:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_509:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %ymm29,%ymm17,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_510
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_510
|
|
.L_small_initial_partial_block_510:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_510:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_510
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_510:
|
|
jmp .L_last_blocks_done_506
|
|
.L_last_num_blocks_is_3_506:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_511
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_511
|
|
|
|
.L_16_blocks_overflow_511:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_511:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_512
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_512
|
|
.L_small_initial_partial_block_512:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_512:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_512
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_512:
|
|
jmp .L_last_blocks_done_506
|
|
.L_last_num_blocks_is_4_506:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_513
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_513
|
|
|
|
.L_16_blocks_overflow_513:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_513:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_514
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_514
|
|
.L_small_initial_partial_block_514:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_514:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_514
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_514:
|
|
jmp .L_last_blocks_done_506
|
|
.L_last_num_blocks_is_5_506:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
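# In this handler the first 64 bytes are processed with full-width loads/stores,
# so the mask is computed from the byte count beyond 64.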
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_515
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_515
|
|
|
|
.L_16_blocks_overflow_515:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_515:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %xmm29,%xmm19,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_516
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_516
|
|
.L_small_initial_partial_block_516:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_516:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_516
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_516:
|
|
jmp .L_last_blocks_done_506
|
|
.L_last_num_blocks_is_6_506:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_517
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_517
|
|
|
|
.L_16_blocks_overflow_517:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_517:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %ymm29,%ymm19,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_518
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_518
|
|
.L_small_initial_partial_block_518:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_518:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_518
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_518:
|
|
jmp .L_last_blocks_done_506
|
|
.L_last_num_blocks_is_7_506:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_519
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_519
|
|
|
|
.L_16_blocks_overflow_519:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_519:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_520
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_520
|
|
.L_small_initial_partial_block_520:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_520:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_520
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_520:
|
|
jmp .L_last_blocks_done_506
|
|
.L_last_num_blocks_is_8_506:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_521
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_521
|
|
|
|
.L_16_blocks_overflow_521:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_521:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_522
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_522
|
|
.L_small_initial_partial_block_522:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_522:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_522
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_522:
|
|
jmp .L_last_blocks_done_506
|
|
.L_last_num_blocks_is_9_506:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
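# Here two full 64-byte chunks are processed unmasked, so the mask is computed
# from the byte count beyond 128.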
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_523
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_523
|
|
|
|
.L_16_blocks_overflow_523:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_523:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %xmm29,%xmm20,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_524
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_524
|
|
.L_small_initial_partial_block_524:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_524:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_524
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_524:
|
|
jmp .L_last_blocks_done_506
|
|
.L_last_num_blocks_is_10_506:
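# Tail path: 10 blocks remain. Derive the byte mask for the partial 64-byte lane from
# byte64_len_to_mask_table, build 10 counter blocks (taking the byte-swapped add path
# if the low counter byte would wrap), run the AES rounds interleaved with GHASH of
# the 16 blocks saved on the stack, then mask-store the result and hash the tail blocks.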
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_525
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_525
|
|
|
|
.L_16_blocks_overflow_525:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_525:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %ymm29,%ymm20,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_526
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_526
|
|
.L_small_initial_partial_block_526:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_526:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_526
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_526:
|
|
jmp .L_last_blocks_done_506
|
|
.L_last_num_blocks_is_11_506:
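# Tail path for an 11-block remainder; same structure as the 10-block case above.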
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_527
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_527
|
|
|
|
.L_16_blocks_overflow_527:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_527:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_528
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_528
|
|
.L_small_initial_partial_block_528:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_528:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_528
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_528:
|
|
jmp .L_last_blocks_done_506
|
|
.L_last_num_blocks_is_12_506:
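# Tail path for a 12-block remainder; same structure as the 10-block case.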
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_529
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_529
|
|
|
|
.L_16_blocks_overflow_529:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_529:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_530
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_530
|
|
.L_small_initial_partial_block_530:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_530:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_530
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_530:
|
|
jmp .L_last_blocks_done_506
|
|
.L_last_num_blocks_is_13_506:
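# Tail path for a 13-block remainder: a fourth counter register (%xmm5) comes into
# play and the store mask now covers the fourth 64-byte lane.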
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_531
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_531
|
|
|
|
.L_16_blocks_overflow_531:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_531:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %xmm29,%xmm21,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_532
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_532
|
|
.L_small_initial_partial_block_532:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_532:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_532
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_532:
|
|
jmp .L_last_blocks_done_506
|
|
.L_last_num_blocks_is_14_506:
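# Tail path for a 14-block remainder; same structure, with %ymm5 carrying the last
# two counter blocks.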
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_533
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_533
|
|
|
|
.L_16_blocks_overflow_533:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_533:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %ymm29,%ymm21,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_534
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_534
|
|
.L_small_initial_partial_block_534:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_534:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_534
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_534:
|
|
jmp .L_last_blocks_done_506
|
|
.L_last_num_blocks_is_15_506:
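# Tail path for a 15-block remainder; same structure as above.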
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_535
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_535
|
|
|
|
.L_16_blocks_overflow_535:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_535:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_536
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_536
|
|
.L_small_initial_partial_block_536:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_536:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_536
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_536:
|
|
jmp .L_last_blocks_done_506
|
|
.L_last_num_blocks_is_16_506:
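# Tail path for a full 16-block remainder: the final block may still be partial, so
# this case always falls through to the partial-block bookkeeping below.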
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_537
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_537
|
|
|
|
.L_16_blocks_overflow_537:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_537:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_538:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_538:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_538:
|
|
jmp .L_last_blocks_done_506
|
|
.L_last_num_blocks_is_0_506:
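# No message blocks remain: fold the 16 blocks saved on the stack into the GHASH
# accumulators and perform the final reduction modulo the GHASH polynomial (POLY2)
# into %xmm14.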
vmovdqa64 1024(%rsp),%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1088(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1152(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1216(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_506:
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_497
|
|
.L_encrypt_32_blocks_497:
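# 32-block path: two 16-block AES-CTR passes over the next 512 bytes of input.
# The AES rounds (round keys broadcast from (%rdi)) are interleaved with GHASH
# multiplies of previously buffered blocks; the freshly loaded input blocks are
# byte-reflected and cached on the stack for the following GHASH pass, %r8 is
# reduced by 512, and the sub-16-block tail is dispatched below.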
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_539
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_539
|
|
.L_16_blocks_overflow_539:
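# Counter-overflow variant: the low counter byte is about to wrap, so the
# counter block is byte-swapped, incremented via ddq_add_1234/ddq_add_4444,
# and swapped back before use.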
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_539:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp),%zmm1
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vpxorq %zmm12,%zmm6,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm17,%zmm0
|
|
vpshufb %zmm29,%zmm19,%zmm3
|
|
vpshufb %zmm29,%zmm20,%zmm4
|
|
vpshufb %zmm29,%zmm21,%zmm5
|
|
vmovdqa64 %zmm0,1280(%rsp)
|
|
vmovdqa64 %zmm3,1344(%rsp)
|
|
vmovdqa64 %zmm4,1408(%rsp)
|
|
vmovdqa64 %zmm5,1472(%rsp)
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_540
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_540
|
|
.L_16_blocks_overflow_540:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_540:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 256(%rsp),%zmm1
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 320(%rsp),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 384(%rsp),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 448(%rsp),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
vmovdqu8 256(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 320(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 384(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 448(%rcx,%r11,1),%zmm21
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vpternlogq $0x96,%zmm12,%zmm6,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
movq %r9,%r10
|
|
vmovdqu8 %zmm0,256(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,320(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,384(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,448(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm17,%zmm0
|
|
vpshufb %zmm29,%zmm19,%zmm3
|
|
vpshufb %zmm29,%zmm20,%zmm4
|
|
vpshufb %zmm29,%zmm21,%zmm5
|
|
vmovdqa64 %zmm0,768(%rsp)
|
|
vmovdqa64 %zmm3,832(%rsp)
|
|
vmovdqa64 %zmm4,896(%rsp)
|
|
vmovdqa64 %zmm5,960(%rsp)
|
|
vmovdqa64 1280(%rsp),%zmm13
|
|
vmovdqu64 512(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1344(%rsp),%zmm13
|
|
vmovdqu64 576(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1408(%rsp),%zmm13
|
|
vmovdqu64 640(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1472(%rsp),%zmm13
|
|
vmovdqu64 704(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
subq $512,%r8
|
|
addq $512,%r11
|
|
movl %r8d,%r10d
|
|
andl $~15,%r10d
|
|
movl $512,%ebx
|
|
subl %r10d,%ebx
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
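# %r10d now holds the number of remaining 16-byte blocks, rounded up; the
# branches below dispatch to the matching .L_last_num_blocks_is_N_541 handler.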
|
|
je .L_last_num_blocks_is_0_541
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_541
|
|
jb .L_last_num_blocks_is_7_1_541
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_541
|
|
jb .L_last_num_blocks_is_11_9_541
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_541
|
|
ja .L_last_num_blocks_is_16_541
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_541
|
|
jmp .L_last_num_blocks_is_13_541
.L_last_num_blocks_is_11_9_541:
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_541
|
|
ja .L_last_num_blocks_is_11_541
|
|
jmp .L_last_num_blocks_is_9_541
.L_last_num_blocks_is_7_1_541:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_541
|
|
jb .L_last_num_blocks_is_3_1_541
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_541
|
|
je .L_last_num_blocks_is_6_541
|
|
jmp .L_last_num_blocks_is_5_541
.L_last_num_blocks_is_3_1_541:
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_541
|
|
je .L_last_num_blocks_is_2_541
|
|
.L_last_num_blocks_is_1_541:
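# Tail handlers: each .L_last_num_blocks_is_N_541 block processes the final N
# (<16) counter blocks, using a byte mask from byte64_len_to_mask_table (%k1)
# for the partial load/store, and folds the data into the GHASH accumulators.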
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_542
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_542
.L_16_blocks_overflow_542:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_542:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %xmm29,%xmm17,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_543
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
jmp .L_small_initial_compute_done_543
|
|
.L_small_initial_partial_block_543:
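# Partial final block: record the remaining byte count at (%rdx) and the last
# processed block (%xmm11) at 16(%rsi), reduce the GHASH accumulated so far,
# then XOR in the byte-reflected partial block (%xmm7).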
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm0
vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
|
|
vpslldq $8,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm3
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
|
|
vpsrldq $4,%xmm4,%xmm4
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm4,%xmm14
vpxorq %xmm7,%xmm14,%xmm14
jmp .L_after_reduction_543
|
|
.L_small_initial_compute_done_543:
|
|
.L_after_reduction_543:
|
|
jmp .L_last_blocks_done_541
|
|
.L_last_num_blocks_is_2_541:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_544
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_544
|
|
|
|
.L_16_blocks_overflow_544:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_544:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %ymm29,%ymm17,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_545
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
jmp .L_small_initial_compute_done_545
|
|
.L_small_initial_partial_block_545:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
.L_small_initial_compute_done_545:
orq %r8,%r8
|
|
je .L_after_reduction_545
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_545:
|
|
jmp .L_last_blocks_done_541
|
|
.L_last_num_blocks_is_3_541:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_546
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_546
|
|
|
|
.L_16_blocks_overflow_546:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_546:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_547
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_547
|
|
.L_small_initial_partial_block_547:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_547:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_547
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_547:
|
|
jmp .L_last_blocks_done_541
|
|
.L_last_num_blocks_is_4_541:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_548
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_548
|
|
|
|
.L_16_blocks_overflow_548:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_548:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_549
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_549
|
|
.L_small_initial_partial_block_549:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_549:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_549
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_549:
|
|
jmp .L_last_blocks_done_541
|
|
.L_last_num_blocks_is_5_541:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_550
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_550
|
|
|
|
.L_16_blocks_overflow_550:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_550:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %xmm29,%xmm19,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_551
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_551
|
|
.L_small_initial_partial_block_551:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_551:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_551
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_551:
|
|
jmp .L_last_blocks_done_541
|
|
.L_last_num_blocks_is_6_541:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_552
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_552
|
|
|
|
.L_16_blocks_overflow_552:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_552:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %ymm29,%ymm19,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_553
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_553
|
|
.L_small_initial_partial_block_553:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_553:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_553
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_553:
|
|
jmp .L_last_blocks_done_541
|
|
.L_last_num_blocks_is_7_541:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_554
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_554
|
|
|
|
.L_16_blocks_overflow_554:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_554:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_555
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_555
|
|
.L_small_initial_partial_block_555:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_555:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_555
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_555:
|
|
jmp .L_last_blocks_done_541
|
|
.L_last_num_blocks_is_8_541:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_556
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_556
|
|
|
|
.L_16_blocks_overflow_556:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_556:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_557
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_557
|
|
.L_small_initial_partial_block_557:
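# Partial final block: the leftover byte count is recorded at (%rdx) and
# the last output block (xmm11) is stashed at 16(%rsi), apparently so a
# later call can complete it.  Only the preceding full blocks are hashed
# here (H powers starting at 240(%rsi)); the partial block's contribution
# is added after the reduction, at the compute_done label.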
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_557:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_557
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_557:
|
|
jmp .L_last_blocks_done_541
|
|
.L_last_num_blocks_is_9_541:
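# 9-block tail: as the 8-block path, but the mask is taken relative to
# 128 bytes (subq $128), the 9th counter block lives in xmm4, and the
# counter-overflow check is against 247.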
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_558
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_558
|
|
|
|
.L_16_blocks_overflow_558:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_558:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %xmm29,%xmm20,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_559
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_559
|
|
.L_small_initial_partial_block_559:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_559:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_559
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_559:
|
|
jmp .L_last_blocks_done_541
|
|
.L_last_num_blocks_is_10_541:
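# 10-block tail: blocks 9-10 are carried in ymm4; overflow check against 246.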
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_560
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_560
|
|
|
|
.L_16_blocks_overflow_560:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_560:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %ymm29,%ymm20,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_561
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_561
|
|
.L_small_initial_partial_block_561:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_561:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_561
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_561:
|
|
jmp .L_last_blocks_done_541
|
|
.L_last_num_blocks_is_11_541:
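# 11-block tail: blocks 9-11 are carried in zmm4; overflow check against 245.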
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_562
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_562
|
|
|
|
.L_16_blocks_overflow_562:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_562:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_563
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_563
|
|
.L_small_initial_partial_block_563:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_563:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_563
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_563:
|
|
jmp .L_last_blocks_done_541
|
|
.L_last_num_blocks_is_12_541:
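# 12-block tail: zmm4 is fully used for blocks 9-12; overflow check against 244.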
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_564
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_564
|
|
|
|
.L_16_blocks_overflow_564:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_564:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_565
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_565
|
|
.L_small_initial_partial_block_565:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_565:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_565
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_565:
|
|
jmp .L_last_blocks_done_541
|
|
.L_last_num_blocks_is_13_541:
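# 13-block tail: mask taken relative to 192 bytes (subq $192); block 13 is
# carried in xmm5; overflow check against 243.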
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_566
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_566
|
|
|
|
.L_16_blocks_overflow_566:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_566:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %xmm29,%xmm21,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_567
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_567
|
|
.L_small_initial_partial_block_567:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_567:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_567
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_567:
|
|
jmp .L_last_blocks_done_541
|
|
.L_last_num_blocks_is_14_541:
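# 14-block tail: blocks 13-14 are carried in ymm5; overflow check against 242.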
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_568
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_568
|
|
|
|
.L_16_blocks_overflow_568:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_568:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %ymm29,%ymm21,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_569
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_569
|
|
.L_small_initial_partial_block_569:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_569:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_569
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_569:
|
|
jmp .L_last_blocks_done_541
|
|
.L_last_num_blocks_is_15_541:
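# 15-block tail: blocks 13-15 are carried in zmm5; overflow check against 241.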
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_570
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_570
|
|
|
|
.L_16_blocks_overflow_570:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_570:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_571
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_571
|
|
.L_small_initial_partial_block_571:
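# Partial final block: the residual byte count is written to (%rdx) and the
# last output block to 16(%rsi); the full blocks processed so far are folded
# into the GHASH accumulator using the stored hash-key powers.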
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_571:
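# If any bytes remain beyond the full blocks (%r8 != 0), XOR the byte-swapped
# last block into the GHASH accumulator before leaving.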
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_571
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_571:
|
|
jmp .L_last_blocks_done_541
|
|
.L_last_num_blocks_is_16_541:
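# Exactly 16 blocks (up to 256 bytes) remain: build a byte mask for the final
# 64-byte chunk from byte64_len_to_mask_table, generate the next 16 counter
# blocks (a separate path handles the case where the counter increment would
# wrap), then run the AES rounds interleaved with GHASH of the blocks saved
# on the stack from the previous pass.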
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_572
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_572
|
|
|
|
.L_16_blocks_overflow_572:
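# Counter wrap path: byte-swap the counter block, perform the 32-bit adds in
# little-endian order, then swap each block back to big-endian.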
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_572:
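# AES rounds for the four counter zmm registers are interleaved with the
# GHASH multiply-accumulate of the stashed blocks; partial products are
# combined with vpternlogq $0x96 (three-way XOR).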
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_573:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_573:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_573:
|
|
jmp .L_last_blocks_done_541
|
|
.L_last_num_blocks_is_0_541:
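# No blocks left to process: finish the GHASH over the four stashed 64-byte
# chunks and reduce the result to a single 128-bit value in %xmm14.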
|
|
vmovdqa64 768(%rsp),%zmm13
|
|
vpxorq %zmm14,%zmm13,%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 832(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpxorq %zmm10,%zmm4,%zmm26
|
|
vpxorq %zmm6,%zmm0,%zmm24
|
|
vpxorq %zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 896(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 960(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
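# Reduce the 256-bit GHASH state (high half in %xmm24, low half in %xmm25)
# modulo the GCM polynomial using the POLY2 constant.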
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_541:
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_497
|
|
.L_encrypt_16_blocks_497:
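# Steady-state 16-block step: generate 16 counter blocks, encrypt them, XOR
# with the input, write the output, and save the byte-swapped input blocks at
# 1280(%rsp)..1472(%rsp) for hashing on a later pass, while GHASH is advanced
# over the blocks saved previously.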
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_574
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_574
|
|
.L_16_blocks_overflow_574:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_574:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vpxorq %zmm12,%zmm6,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm17,%zmm0
|
|
vpshufb %zmm29,%zmm19,%zmm3
|
|
vpshufb %zmm29,%zmm20,%zmm4
|
|
vpshufb %zmm29,%zmm21,%zmm5
|
|
vmovdqa64 %zmm0,1280(%rsp)
|
|
vmovdqa64 %zmm3,1344(%rsp)
|
|
vmovdqa64 %zmm4,1408(%rsp)
|
|
vmovdqa64 %zmm5,1472(%rsp)
|
|
vmovdqa64 1024(%rsp),%zmm13
|
|
vmovdqu64 256(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1088(%rsp),%zmm13
|
|
vmovdqu64 320(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1152(%rsp),%zmm13
|
|
vmovdqu64 384(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1216(%rsp),%zmm13
|
|
vmovdqu64 448(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
subq $256,%r8
|
|
addq $256,%r11
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
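# %r10d = number of 16-byte blocks still to process (rounded up); dispatch to
# the matching tail handler below.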
|
|
je .L_last_num_blocks_is_0_575
|
|
|
|
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_575
|
|
jb .L_last_num_blocks_is_7_1_575
|
|
|
|
|
|
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_575
|
|
jb .L_last_num_blocks_is_11_9_575
|
|
|
|
|
|
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_575
|
|
ja .L_last_num_blocks_is_16_575
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_575
|
|
jmp .L_last_num_blocks_is_13_575
|
|
|
|
.L_last_num_blocks_is_11_9_575:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_575
|
|
ja .L_last_num_blocks_is_11_575
|
|
jmp .L_last_num_blocks_is_9_575
|
|
|
|
.L_last_num_blocks_is_7_1_575:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_575
|
|
jb .L_last_num_blocks_is_3_1_575
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_575
|
|
je .L_last_num_blocks_is_6_575
|
|
jmp .L_last_num_blocks_is_5_575
|
|
|
|
.L_last_num_blocks_is_3_1_575:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_575
|
|
je .L_last_num_blocks_is_2_575
|
|
.L_last_num_blocks_is_1_575:
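# One block remains: a single xmm AES stream is carried through the rounds
# while the full-width GHASH of the 16 stashed blocks completes in parallel.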
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_576
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_576
|
|
|
|
.L_16_blocks_overflow_576:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_576:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %xmm29,%xmm17,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_577
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_577
|
|
.L_small_initial_partial_block_577:
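# Single partial block: nothing new to multiply; record the byte count, save
# the block, and XOR it into the accumulator.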
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_577
|
|
.L_small_initial_compute_done_577:
|
|
.L_after_reduction_577:
|
|
jmp .L_last_blocks_done_575
|
|
.L_last_num_blocks_is_2_575:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_578
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_578
|
|
|
|
.L_16_blocks_overflow_578:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_578:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %ymm29,%ymm17,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_579
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_579
|
|
.L_small_initial_partial_block_579:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_579:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_579
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_579:
|
|
jmp .L_last_blocks_done_575
|
|
.L_last_num_blocks_is_3_575:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_580
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_580
|
|
|
|
.L_16_blocks_overflow_580:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_580:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_581
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_581
|
|
.L_small_initial_partial_block_581:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_581:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_581
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_581:
|
|
jmp .L_last_blocks_done_575
|
|
.L_last_num_blocks_is_4_575:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_582
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_582
|
|
|
|
.L_16_blocks_overflow_582:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_582:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_583
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_583
|
|
.L_small_initial_partial_block_583:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_583:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_583
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_583:
|
|
jmp .L_last_blocks_done_575
|
|
.L_last_num_blocks_is_5_575:
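# Five blocks remain: four are processed in a full zmm lane and the fifth in
# an xmm lane; %k1 masks the final 16-byte load/store.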
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_584
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_584
|
|
|
|
.L_16_blocks_overflow_584:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_584:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %xmm29,%xmm19,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_585
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_585
|
|
.L_small_initial_partial_block_585:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_585:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_585
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_585:
|
|
jmp .L_last_blocks_done_575
|
|
.L_last_num_blocks_is_6_575:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_586
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_586
|
|
|
|
.L_16_blocks_overflow_586:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_586:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %ymm29,%ymm19,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_587
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_587
|
|
.L_small_initial_partial_block_587:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_587:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_587
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_587:
|
|
jmp .L_last_blocks_done_575
|
|
.L_last_num_blocks_is_7_575:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_588
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_588
|
|
|
|
.L_16_blocks_overflow_588:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_588:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
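# Load the remaining input from %rcx + %r11: the first 64 bytes in full, the
# final chunk under mask %k1 with unselected bytes zeroed.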
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
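# The middle products are split and merged into the high (%zmm14) and low
# (%zmm7) halves, each is folded 512 -> 128 bits with vextract + XOR, and the
# result is reduced modulo the GCM polynomial using the POLY2 constant just
# loaded, leaving the accumulator in %xmm14.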
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
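# Write the processed tail to the output at %r9 + %r11 (final chunk under
# mask %k1), then re-mask and byte-reflect the input blocks so they can be
# accumulated into GHASH; %xmm7 receives the (possibly partial) last block.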
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
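# 6 of the 7 blocks are certainly complete.  If at least 16 bytes still
# remain, the 7th is complete as well: clear the partial-block count at (%rdx)
# and GHASH all 7 blocks; otherwise branch to the partial-block path.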
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_589
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_589
|
|
.L_small_initial_partial_block_589:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_589:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_589
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_589:
|
|
jmp .L_last_blocks_done_575
|
|
.L_last_num_blocks_is_8_575:
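# 8-block tail: identical structure, with the overflow threshold at 248 and
# the byte mask covering up to the full second 64-byte chunk.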
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_590
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_590
|
|
|
|
.L_16_blocks_overflow_590:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_590:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_591
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_591
|
|
.L_small_initial_partial_block_591:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_591:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_591
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_591:
|
|
jmp .L_last_blocks_done_575
|
|
.L_last_num_blocks_is_9_575:
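# 9..12-block tails: the data now spans a third 64-byte chunk, so a third
# counter vector is generated (xmm/ymm/zmm as the count grows) and the byte
# mask is taken from byte64_len_to_mask_table[len - 128].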
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_592
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_592
|
|
|
|
.L_16_blocks_overflow_592:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_592:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %xmm29,%xmm20,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_593
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_593
|
|
.L_small_initial_partial_block_593:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_593:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_593
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_593:
|
|
jmp .L_last_blocks_done_575
|
|
.L_last_num_blocks_is_10_575:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_594
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_594
|
|
|
|
.L_16_blocks_overflow_594:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_594:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %ymm29,%ymm20,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_595
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_595
|
|
.L_small_initial_partial_block_595:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_595:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_595
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_595:
|
|
jmp .L_last_blocks_done_575
|
|
.L_last_num_blocks_is_11_575:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_596
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_596
|
|
|
|
.L_16_blocks_overflow_596:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_596:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_597
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_597
|
|
.L_small_initial_partial_block_597:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_597:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_597
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_597:
|
|
jmp .L_last_blocks_done_575
|
|
.L_last_num_blocks_is_12_575:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_598
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_598
|
|
|
|
.L_16_blocks_overflow_598:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_598:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_599
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
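# With three groups of hash-key powers in play here, the partial products are
# merged with vpternlogq (XOR of three operands) instead of separate vpxorq
# steps.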
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_599
|
|
.L_small_initial_partial_block_599:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_599:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_599
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_599:
|
|
jmp .L_last_blocks_done_575
|
|
.L_last_num_blocks_is_13_575:
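# 13-block tail and up: a fourth counter vector is needed and the byte mask
# comes from byte64_len_to_mask_table[len - 192].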
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_600
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_600
|
|
|
|
.L_16_blocks_overflow_600:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_600:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %xmm29,%xmm21,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_601
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_601
|
|
.L_small_initial_partial_block_601:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_601:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_601
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_601:
|
|
jmp .L_last_blocks_done_575
|
|
.L_last_num_blocks_is_14_575:
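# 14-block tail: the fourth counter vector (%ymm5) now holds two blocks.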
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_602
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_602
|
|
|
|
.L_16_blocks_overflow_602:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_602:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
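# vpternlogq with imm8 0x96 is a three-way XOR; fold this round's high, low
# and middle carry-less-multiply terms into the GHASH accumulators.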
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
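# Load the 14 source blocks: three full 64-byte chunks plus a k1-masked
# (zero-filled) load of the final partial chunk.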
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
vmovdqa64 POLY2(%rip),%xmm16
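# Final GHASH reduction modulo the GCM polynomial (POLY2), interleaved with
# the remaining AES rounds.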
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %ymm29,%ymm21,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
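# 13 full blocks are accounted for; %r8 now holds the byte count of the
# final (possibly partial) block.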
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_603
subq $16,%r8
movq $0,(%rdx)
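# The last block was a full 16 bytes: record that no partial block carries
# over, then fold all 14 ciphertext blocks into GHASH.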
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_603
.L_small_initial_partial_block_603:
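# The last block is partial: store its byte count at (%rdx), stash the last
# output block in the context, and fold only the 13 complete blocks into
# GHASH.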
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_603:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_603
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_603:
|
|
jmp .L_last_blocks_done_575
|
|
.L_last_num_blocks_is_15_575:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_604
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_604
|
|
|
|
.L_16_blocks_overflow_604:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_604:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_605
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_605
|
|
.L_small_initial_partial_block_605:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_605:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_605
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_605:
|
|
jmp .L_last_blocks_done_575
|
|
.L_last_num_blocks_is_16_575:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_606
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_606
|
|
|
|
.L_16_blocks_overflow_606:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_606:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_607:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_607:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_607:
|
|
jmp .L_last_blocks_done_575
.L_last_num_blocks_is_0_575:
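# No whole blocks left in the tail: finish GHASH over the ciphertext still
# buffered on the stack and reduce to a single 128-bit value in %xmm14.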
vmovdqa64 1280(%rsp),%zmm13
|
|
vmovdqu64 512(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1344(%rsp),%zmm13
|
|
vmovdqu64 576(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1408(%rsp),%zmm13
|
|
vmovdqu64 640(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1472(%rsp),%zmm13
|
|
vmovdqu64 704(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_575:
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_497
.L_message_below_32_blocks_497:
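# Message shorter than 32 blocks: the first 256 bytes (16 blocks) are
# already processed, so advance the offset, shrink the length, and handle
# the remainder as a tail.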
subq $256,%r8
|
|
addq $256,%r11
|
|
movl %r8d,%r10d
|
|
testq %r14,%r14
jnz .L_skip_hkeys_precomputation_608
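# First pass only: extend the cached table of hash-key powers on the stack
# (stores at 448/384/320/256(%rsp)) by carry-less multiplication and POLY2
# reduction.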
vmovdqu64 640(%rsp),%zmm3
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
|
|
|
|
vmovdqu64 576(%rsp),%zmm4
|
|
vmovdqu64 512(%rsp),%zmm5
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,448(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,384(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,320(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,256(%rsp)
|
|
.L_skip_hkeys_precomputation_608:
|
|
movq $1,%r14
|
|
andl $~15,%r10d
|
|
movl $512,%ebx
|
|
subl %r10d,%ebx
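# %rbx = 512 - (tail bytes rounded down to a multiple of 16): offset into
# the on-stack hash-key table so the key powers line up with the number of
# buffered blocks.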
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
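# %r10d = ceil(remaining bytes / 16): dispatch below to the matching
# .L_last_num_blocks_is_N handler for the tail.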
je .L_last_num_blocks_is_0_609
|
|
|
|
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_609
|
|
jb .L_last_num_blocks_is_7_1_609
|
|
|
|
|
|
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_609
|
|
jb .L_last_num_blocks_is_11_9_609
|
|
|
|
|
|
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_609
|
|
ja .L_last_num_blocks_is_16_609
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_609
|
|
jmp .L_last_num_blocks_is_13_609
|
|
|
|
.L_last_num_blocks_is_11_9_609:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_609
|
|
ja .L_last_num_blocks_is_11_609
|
|
jmp .L_last_num_blocks_is_9_609
|
|
|
|
.L_last_num_blocks_is_7_1_609:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_609
|
|
jb .L_last_num_blocks_is_3_1_609
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_609
|
|
je .L_last_num_blocks_is_6_609
|
|
jmp .L_last_num_blocks_is_5_609
|
|
|
|
.L_last_num_blocks_is_3_1_609:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_609
|
|
je .L_last_num_blocks_is_2_609
|
|
.L_last_num_blocks_is_1_609:
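# Single trailing block: k1-masked load, one xmm-wide AES-CTR pass, masked
# store, and the corresponding GHASH update.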
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_610
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_610
|
|
|
|
.L_16_blocks_overflow_610:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_610:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %xmm29,%xmm17,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_611
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_611
|
|
.L_small_initial_partial_block_611:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm0
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
|
|
vpslldq $8,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm3
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
|
|
vpsrldq $4,%xmm4,%xmm4
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm4,%xmm14
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_611
|
|
.L_small_initial_compute_done_611:
|
|
.L_after_reduction_611:
|
|
jmp .L_last_blocks_done_609
|
|
.L_last_num_blocks_is_2_609:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_612
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_612
|
|
|
|
.L_16_blocks_overflow_612:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_612:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %ymm29,%ymm17,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_613
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_613
|
|
.L_small_initial_partial_block_613:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_613:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_613
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_613:
|
|
jmp .L_last_blocks_done_609
|
|
.L_last_num_blocks_is_3_609:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_614
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_614
|
|
|
|
.L_16_blocks_overflow_614:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_614:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_615
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_615
|
|
.L_small_initial_partial_block_615:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_615:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_615
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_615:
|
|
jmp .L_last_blocks_done_609
|
|
.L_last_num_blocks_is_4_609:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_616
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_616
|
|
|
|
.L_16_blocks_overflow_616:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_616:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_617
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_617
|
|
.L_small_initial_partial_block_617:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_617:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_617
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_617:
|
|
jmp .L_last_blocks_done_609
|
|
.L_last_num_blocks_is_5_609:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_618
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_618
|
|
|
|
.L_16_blocks_overflow_618:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_618:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %xmm29,%xmm19,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_619
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_619
|
|
.L_small_initial_partial_block_619:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
.L_small_initial_compute_done_619:
orq %r8,%r8
je .L_after_reduction_619
vpxorq %xmm7,%xmm14,%xmm14
.L_after_reduction_619:
jmp .L_last_blocks_done_609
.L_last_num_blocks_is_6_609:
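# Added annotation: 6-block tail - 4 blocks in %zmm0 plus 2 in a %k1-masked
# %ymm3; the .L_16_blocks_overflow_620 path rebuilds the counters with
# byte-swapped adds when the low counter byte is about to wrap.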
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_620
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_620
|
|
|
|
.L_16_blocks_overflow_620:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_620:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %ymm29,%ymm19,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_621
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_621
|
|
.L_small_initial_partial_block_621:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_621:
orq %r8,%r8
je .L_after_reduction_621
vpxorq %xmm7,%xmm14,%xmm14
.L_after_reduction_621:
jmp .L_last_blocks_done_609
|
|
.L_last_num_blocks_is_7_609:
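# Added annotation: 7-block tail - 4 blocks in %zmm0 plus 3 in a %k1-masked %zmm3.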
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_622
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_622
|
|
|
|
.L_16_blocks_overflow_622:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_622:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_623
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_623
|
|
.L_small_initial_partial_block_623:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_623:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_623
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_623:
|
|
jmp .L_last_blocks_done_609
|
|
.L_last_num_blocks_is_8_609:
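# Added annotation: 8-block tail - 4 blocks in %zmm0 plus 4 in %zmm3, with %k1
# masking the final (possibly partial) 64-byte chunk.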
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_624
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_624
|
|
|
|
.L_16_blocks_overflow_624:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_624:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_625
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_625
|
|
.L_small_initial_partial_block_625:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_625:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_625
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_625:
|
|
jmp .L_last_blocks_done_609
|
|
.L_last_num_blocks_is_9_609:
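# Added annotation: 9-block tail - 4 + 4 blocks in %zmm0/%zmm3 plus 1 in a
# %k1-masked %xmm4; the mask index is %r8 - 128 because two full 64-byte
# chunks precede the masked one.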
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_626
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_626
|
|
|
|
.L_16_blocks_overflow_626:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_626:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %xmm29,%xmm20,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_627
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_627
|
|
.L_small_initial_partial_block_627:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_627:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_627
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_627:
|
|
jmp .L_last_blocks_done_609
|
|
.L_last_num_blocks_is_10_609:
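# Added annotation: 10-block tail - 4 + 4 blocks in %zmm0/%zmm3 plus 2 in a %k1-masked %ymm4.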
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_628
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_628
|
|
|
|
.L_16_blocks_overflow_628:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_628:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %ymm29,%ymm20,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_629
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_629
|
|
.L_small_initial_partial_block_629:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_629:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_629
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_629:
|
|
jmp .L_last_blocks_done_609
|
|
.L_last_num_blocks_is_11_609:
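# Added annotation: 11-block tail - 4 + 4 blocks in %zmm0/%zmm3 plus 3 in a %k1-masked %zmm4.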
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_630
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_630
|
|
|
|
.L_16_blocks_overflow_630:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_630:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_631
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_631
|
|
.L_small_initial_partial_block_631:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_631:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_631
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_631:
|
|
jmp .L_last_blocks_done_609
|
|
.L_last_num_blocks_is_12_609:
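# Added annotation: 12-block tail - 4 + 4 + 4 blocks in %zmm0/%zmm3/%zmm4, with
# %k1 masking the final (possibly partial) 64-byte chunk.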
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_632
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_632
|
|
|
|
.L_16_blocks_overflow_632:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_632:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_633
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_633
|
|
.L_small_initial_partial_block_633:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_633:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_633
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_633:
|
|
jmp .L_last_blocks_done_609
|
|
.L_last_num_blocks_is_13_609:
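# Added annotation: 13-block tail - 4 + 4 + 4 blocks in %zmm0/%zmm3/%zmm4 plus
# 1 in a %k1-masked %xmm5; the mask index is %r8 - 192.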
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_634
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_634
|
|
|
|
.L_16_blocks_overflow_634:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_634:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %xmm29,%xmm21,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_635
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_635
|
|
.L_small_initial_partial_block_635:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_635:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_635
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_635:
|
|
jmp .L_last_blocks_done_609
|
|
.L_last_num_blocks_is_14_609:
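# Annotation (added by hand, not emitted by the generator): tail path for 14
# remaining blocks. Derives 14 counter blocks, encrypts them with the 11 round
# keys at 0..160(%rdi), XORs against the masked input at (%rcx,%r11,1), writes
# the result to (%r9,%r11,1), then folds the data blocks into the GHASH state.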
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_636
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_636
|
|
|
|
.L_16_blocks_overflow_636:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_636:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %ymm29,%ymm21,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_637
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_637
|
|
.L_small_initial_partial_block_637:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_637:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_637
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_637:
|
|
jmp .L_last_blocks_done_609
|
|
.L_last_num_blocks_is_15_609:
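# Annotation: same structure as the 14-block path above, but for 15 remaining
# blocks; the final 64-byte load/store is masked through %k1 and the counter
# for the next call is refreshed from lane 2 of %zmm5.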
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_638
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_638
|
|
|
|
.L_16_blocks_overflow_638:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_638:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_639
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_639
|
|
.L_small_initial_partial_block_639:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_639:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_639
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_639:
|
|
jmp .L_last_blocks_done_609
|
|
.L_last_num_blocks_is_16_609:
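# Annotation: full 16-block tail. Same flow as the 14/15-block paths with all
# four ZMM lanes active; the saved counter comes from lane 3 of %zmm5 and the
# partial-block bookkeeping below always runs (no jl short-circuit here).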
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_640
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_640
|
|
|
|
.L_16_blocks_overflow_640:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_640:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_641:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_641:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_641:
|
|
jmp .L_last_blocks_done_609
|
|
.L_last_num_blocks_is_0_609:
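# Annotation: no blocks left to encrypt; this path only finishes GHASH over
# the 16 blocks buffered at 768..960(%rsp), using the hash-key powers indexed
# by %rbx, and reduces the result into %xmm14 via POLY2.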
vmovdqa64 768(%rsp),%zmm13
|
|
vpxorq %zmm14,%zmm13,%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 832(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpxorq %zmm10,%zmm4,%zmm26
|
|
vpxorq %zmm6,%zmm0,%zmm24
|
|
vpxorq %zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 896(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 960(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_609:
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_497
.L_message_below_equal_16_blocks_497:
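# Annotation: whole message fits in 16 blocks or fewer. %r12d is set to
# ceil(len/16) and a small compare tree dispatches to the matching
# .L_small_initial_num_blocks_is_N_642 handler below.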
movl %r8d,%r12d
|
|
addl $15,%r12d
|
|
shrl $4,%r12d
|
|
cmpq $8,%r12
|
|
je .L_small_initial_num_blocks_is_8_642
|
|
jl .L_small_initial_num_blocks_is_7_1_642
cmpq $12,%r12
|
|
je .L_small_initial_num_blocks_is_12_642
|
|
jl .L_small_initial_num_blocks_is_11_9_642
cmpq $16,%r12
|
|
je .L_small_initial_num_blocks_is_16_642
|
|
cmpq $15,%r12
|
|
je .L_small_initial_num_blocks_is_15_642
|
|
cmpq $14,%r12
|
|
je .L_small_initial_num_blocks_is_14_642
|
|
jmp .L_small_initial_num_blocks_is_13_642
.L_small_initial_num_blocks_is_11_9_642:
cmpq $11,%r12
|
|
je .L_small_initial_num_blocks_is_11_642
|
|
cmpq $10,%r12
|
|
je .L_small_initial_num_blocks_is_10_642
|
|
jmp .L_small_initial_num_blocks_is_9_642
.L_small_initial_num_blocks_is_7_1_642:
|
|
cmpq $4,%r12
|
|
je .L_small_initial_num_blocks_is_4_642
|
|
jl .L_small_initial_num_blocks_is_3_1_642
cmpq $7,%r12
|
|
je .L_small_initial_num_blocks_is_7_642
|
|
cmpq $6,%r12
|
|
je .L_small_initial_num_blocks_is_6_642
|
|
jmp .L_small_initial_num_blocks_is_5_642
.L_small_initial_num_blocks_is_3_1_642:
cmpq $3,%r12
|
|
je .L_small_initial_num_blocks_is_3_642
|
|
cmpq $2,%r12
|
|
je .L_small_initial_num_blocks_is_2_642
.L_small_initial_num_blocks_is_1_642:
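# Annotation: 1-block case. One counter block is encrypted with the round keys
# at 0..160(%rdi); the load and store are masked through %k1 (length-derived
# mask from byte64_len_to_mask_table), then the block is byte-reflected and
# folded into the GHASH accumulator %xmm14.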
vmovdqa64 SHUF_MASK(%rip),%xmm29
|
|
vpaddd ONE(%rip),%xmm2,%xmm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %xmm15,%xmm0,%xmm0
|
|
vpxorq %xmm6,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %xmm29,%xmm6,%xmm6
|
|
vextracti32x4 $0,%zmm6,%xmm13
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_643
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_643
|
|
.L_small_initial_partial_block_643:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
vpxorq %xmm13,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_643
|
|
.L_small_initial_compute_done_643:
|
|
.L_after_reduction_643:
|
|
jmp .L_small_initial_blocks_encrypted_642
|
|
.L_small_initial_num_blocks_is_2_642:
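# Annotation: 2-block case, same shape as the 1-block path but working on a
# YMM pair; the counter for the caller is refreshed from lane 1 of %zmm0.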
vmovdqa64 SHUF_MASK(%rip),%ymm29
|
|
vshufi64x2 $0,%ymm2,%ymm2,%ymm0
|
|
vpaddd ddq_add_1234(%rip),%ymm0,%ymm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %ymm15,%ymm0,%ymm0
|
|
vpxorq %ymm6,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %ymm29,%ymm6,%ymm6
|
|
vextracti32x4 $1,%zmm6,%xmm13
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_644
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_644
|
|
.L_small_initial_partial_block_644:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_644:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_644
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_644:
|
|
jmp .L_small_initial_blocks_encrypted_642
|
|
.L_small_initial_num_blocks_is_3_642:
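# Annotation: 3-block case; a full ZMM of counter blocks is used with the tail
# masked by %k1, and the saved counter comes from lane 2 of %zmm0.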
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vextracti32x4 $2,%zmm6,%xmm13
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_645
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_645
|
|
.L_small_initial_partial_block_645:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_645:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_645
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_645:
|
|
jmp .L_small_initial_blocks_encrypted_642
|
|
.L_small_initial_num_blocks_is_4_642:
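# Annotation: 4-block case; one full ZMM of counter blocks, the store still
# masked by %k1, with the saved counter taken from lane 3 of %zmm0.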
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vextracti32x4 $3,%zmm6,%xmm13
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_646
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_646
|
|
.L_small_initial_partial_block_646:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_646:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_646
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_646:
|
|
jmp .L_small_initial_blocks_encrypted_642
|
|
.L_small_initial_num_blocks_is_5_642:
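# Annotation: 5-block case; a full ZMM plus one masked XMM block (%xmm3), with
# the 64-byte mask recomputed from len-64 via byte64_len_to_mask_table.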
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %xmm15,%xmm3,%xmm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %xmm7,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %xmm29,%xmm7,%xmm7
|
|
vextracti32x4 $0,%zmm7,%xmm13
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_647
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_647
|
|
.L_small_initial_partial_block_647:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_647:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_647
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_647:
|
|
jmp .L_small_initial_blocks_encrypted_642
|
|
.L_small_initial_num_blocks_is_6_642:
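# Annotation: 6-block case; a full ZMM plus a masked YMM pair (%ymm3).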
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %ymm15,%ymm3,%ymm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %ymm7,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %ymm29,%ymm7,%ymm7
|
|
vextracti32x4 $1,%zmm7,%xmm13
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_648
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_648
|
|
.L_small_initial_partial_block_648:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_648:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_648
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_648:
|
|
jmp .L_small_initial_blocks_encrypted_642
|
|
.L_small_initial_num_blocks_is_7_642:
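# Annotation: 7-block case; a full ZMM plus a second ZMM masked by %k1 for the
# remaining three blocks.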
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vextracti32x4 $2,%zmm7,%xmm13
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_649
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_649
|
|
.L_small_initial_partial_block_649:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_649:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_649
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_649:
|
|
jmp .L_small_initial_blocks_encrypted_642
|
|
.L_small_initial_num_blocks_is_8_642:
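# Tail path for exactly 8 blocks of this AES-128 GCM decrypt call: zmm0/zmm3
# hold counter+1..counter+8, which are encrypted with the 11 round keys at
# 0..160(%rdi); the keystream is XORed with the ciphertext, the plaintext is
# stored (second half under byte mask %k1), and the byte-reflected ciphertext
# is folded into the GHASH accumulator in xmm14.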
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vextracti32x4 $3,%zmm7,%xmm13
|
|
subq $16 * (8 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_650
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_650
|
|
.L_small_initial_partial_block_650:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_650:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_650
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_650:
|
|
jmp .L_small_initial_blocks_encrypted_642
|
|
.L_small_initial_num_blocks_is_9_642:
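# 9-block tail: same pattern as the 8-block case above, plus one extra counter
# block handled in xmm4 (counter+9).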
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %xmm15,%xmm4,%xmm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %xmm10,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %xmm29,%xmm10,%xmm10
|
|
vextracti32x4 $0,%zmm10,%xmm13
|
|
subq $16 * (9 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_651
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_651
|
|
.L_small_initial_partial_block_651:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_651:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_651
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_651:
|
|
jmp .L_small_initial_blocks_encrypted_642
|
|
.L_small_initial_num_blocks_is_10_642:
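# 10-block tail: the third counter/data vector is handled as ymm4
# (counter+9..10).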
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %ymm15,%ymm4,%ymm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %ymm10,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %ymm29,%ymm10,%ymm10
|
|
vextracti32x4 $1,%zmm10,%xmm13
|
|
subq $16 * (10 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_652
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_652
|
|
.L_small_initial_partial_block_652:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_652:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_652
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_652:
|
|
jmp .L_small_initial_blocks_encrypted_642
|
|
.L_small_initial_num_blocks_is_11_642:
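# 11-block tail: a third zmm counter vector (counter+9..12) whose load and
# store are trimmed by the byte mask %k1.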
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vextracti32x4 $2,%zmm10,%xmm13
|
|
subq $16 * (11 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_653
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_653
|
|
.L_small_initial_partial_block_653:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_653:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_653
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_653:
|
|
jmp .L_small_initial_blocks_encrypted_642
|
|
.L_small_initial_num_blocks_is_12_642:
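# 12-block tail: three full zmm counter vectors (counter+1..12); the byte mask
# %k1 derived from the remaining length covers the final vector.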
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vextracti32x4 $3,%zmm10,%xmm13
|
|
subq $16 * (12 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_654
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 160(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_654
|
|
.L_small_initial_partial_block_654:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_654:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_654
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_654:
|
|
jmp .L_small_initial_blocks_encrypted_642
|
|
.L_small_initial_num_blocks_is_13_642:
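# 13-block tail: adds a fourth counter vector, of which only xmm5 (counter+13)
# is used.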
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %xmm15,%xmm5,%xmm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %xmm11,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %xmm29,%xmm11,%xmm11
|
|
vextracti32x4 $0,%zmm11,%xmm13
|
|
subq $16 * (13 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_655
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 144(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_655
|
|
.L_small_initial_partial_block_655:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 160(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_655:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_655
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_655:
|
|
jmp .L_small_initial_blocks_encrypted_642
|
|
.L_small_initial_num_blocks_is_14_642:
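# 14-block tail: the fourth counter vector is handled as ymm5 (counter+13..14).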
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %ymm15,%ymm5,%ymm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %ymm11,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %ymm29,%ymm11,%ymm11
|
|
vextracti32x4 $1,%zmm11,%xmm13
|
|
subq $16 * (14 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_656
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 128(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_656
|
|
.L_small_initial_partial_block_656:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 144(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_656:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_656
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_656:
|
|
jmp .L_small_initial_blocks_encrypted_642
|
|
.L_small_initial_num_blocks_is_15_642:
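# 15-block tail: fourth counter vector zmm5 (counter+13..16), with the last
# load and store masked by %k1.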
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %zmm15,%zmm5,%zmm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %zmm11,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vextracti32x4 $2,%zmm11,%xmm13
|
|
subq $16 * (15 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_657
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 112(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_657
|
|
.L_small_initial_partial_block_657:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 128(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_657:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_657
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_657:
|
|
jmp .L_small_initial_blocks_encrypted_642
|
|
.L_small_initial_num_blocks_is_16_642:
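# 16-block (up to 256-byte) tail: four full counter vectors. This largest
# small-block case has no separate full-block branch; it always records the
# leftover byte count of the final block at (%rdx) below.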
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %zmm15,%zmm5,%zmm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %zmm11,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vextracti32x4 $3,%zmm11,%xmm13
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_658:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 112(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_658:
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_658:
|
|
.L_small_initial_blocks_encrypted_642:
|
|
.L_ghash_done_497:
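# Write the updated counter block (offset 0) and the GHASH accumulator
# (offset 64) back to the context in %rsi before leaving the AES-128 path.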
|
|
vmovdqu64 %xmm2,0(%rsi)
|
|
vmovdqu64 %xmm14,64(%rsi)
|
|
.L_enc_dec_done_497:
|
|
jmp .Lexit_gcm_decrypt
|
|
.align 32
|
|
.Laes_gcm_decrypt_192_avx512:
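# AES-192 variant of the GCM decrypt routine: same control flow as the
# AES-128 path above, using the longer 12-round key schedule at (%rdi).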
|
|
orq %r8,%r8
|
|
je .L_enc_dec_done_659
|
|
xorq %r14,%r14
|
|
vmovdqu64 64(%rsi),%xmm14
|
|
|
|
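# (%rdx) holds the byte count of a partial block carried over from the previous
# call; if non-zero, the saved bytes at 16(%rsi) are realigned via SHIFT_MASK,
# XORed with the incoming ciphertext, and the GHASH state in xmm14 is patched
# (with a full reduction once the block is complete) before any full blocks.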
movq (%rdx),%r11
|
|
orq %r11,%r11
|
|
je .L_partial_block_done_660
|
|
movl $16,%r10d
|
|
leaq byte_len_to_mask_table(%rip),%r12
|
|
cmpq %r10,%r8
|
|
cmovcq %r8,%r10
|
|
kmovw (%r12,%r10,2),%k1
|
|
vmovdqu8 (%rcx),%xmm0{%k1}{z}
|
|
|
|
vmovdqu64 16(%rsi),%xmm3
|
|
vmovdqu64 336(%rsi),%xmm4
leaq SHIFT_MASK(%rip),%r12
|
|
addq %r11,%r12
|
|
vmovdqu64 (%r12),%xmm5
|
|
vpshufb %xmm5,%xmm3,%xmm3
|
|
|
|
vmovdqa64 %xmm0,%xmm6
|
|
vpxorq %xmm0,%xmm3,%xmm3
|
|
|
|
|
|
leaq (%r8,%r11,1),%r13
|
|
subq $16,%r13
|
|
jge .L_no_extra_mask_660
|
|
subq %r13,%r12
|
|
.L_no_extra_mask_660:
|
|
|
|
|
|
|
|
vmovdqu64 16(%r12),%xmm0
|
|
vpand %xmm0,%xmm3,%xmm3
|
|
vpand %xmm0,%xmm6,%xmm6
|
|
vpshufb SHUF_MASK(%rip),%xmm6,%xmm6
|
|
vpshufb %xmm5,%xmm6,%xmm6
|
|
vpxorq %xmm6,%xmm14,%xmm14
|
|
cmpq $0,%r13
|
|
jl .L_partial_incomplete_660
|
|
|
|
vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7
|
|
vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10
|
|
vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11
|
|
vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14
|
|
vpxorq %xmm11,%xmm14,%xmm14
|
|
|
|
vpsrldq $8,%xmm14,%xmm11
|
|
vpslldq $8,%xmm14,%xmm14
|
|
vpxorq %xmm11,%xmm7,%xmm7
|
|
vpxorq %xmm10,%xmm14,%xmm14
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%xmm11
|
|
|
|
vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10
|
|
vpslldq $8,%xmm10,%xmm10
|
|
vpxorq %xmm10,%xmm14,%xmm14
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10
|
|
vpsrldq $4,%xmm10,%xmm10
|
|
vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
|
|
vpternlogq $0x96,%xmm10,%xmm7,%xmm14
|
|
|
|
movq $0,(%rdx)
|
|
|
|
movq %r11,%r12
|
|
movq $16,%r11
|
|
subq %r12,%r11
|
|
jmp .L_enc_dec_done_660
|
|
|
|
.L_partial_incomplete_660:
|
|
addq %r8,(%rdx)
|
|
movq %r8,%r11
|
|
|
|
.L_enc_dec_done_660:
|
|
|
|
|
|
leaq byte_len_to_mask_table(%rip),%r12
|
|
kmovw (%r12,%r11,2),%k1
|
|
vmovdqu64 %xmm14,64(%rsi)
|
|
movq %r9,%r12
|
|
vmovdqu8 %xmm3,(%r12){%k1}
|
|
.L_partial_block_done_660:
|
|
vmovdqu64 0(%rsi),%xmm2
|
|
subq %r11,%r8
|
|
je .L_enc_dec_done_659
|
|
cmpq $256,%r8
|
|
jbe .L_message_below_equal_16_blocks_659
|
|
|
|
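# Bulk-path constants: zmm29 = byte-swap mask, zmm27/zmm28 = per-lane counter increments (+4,+4,+4,+4 and +1,+2,+3,+4) in big-endian form.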
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vmovdqa64 ddq_addbe_4444(%rip),%zmm27
|
|
vmovdqa64 ddq_addbe_1234(%rip),%zmm28
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vmovd %xmm2,%r15d
|
|
andl $255,%r15d
|
|
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
|
|
|
|
|
|
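# Derive 16 counter blocks. If the low counter byte would wrap within the next 16 increments (>= 240), do the additions on the byte-swapped counter so the carry propagates; otherwise the big-endian increments can be added directly.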
cmpb $240,%r15b
|
|
jae .L_next_16_overflow_661
|
|
vpaddd %zmm28,%zmm2,%zmm7
|
|
vpaddd %zmm27,%zmm7,%zmm10
|
|
vpaddd %zmm27,%zmm10,%zmm11
|
|
vpaddd %zmm27,%zmm11,%zmm12
|
|
jmp .L_next_16_ok_661
|
|
.L_next_16_overflow_661:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm12
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
|
|
vpaddd %zmm12,%zmm7,%zmm10
|
|
vpaddd %zmm12,%zmm10,%zmm11
|
|
vpaddd %zmm12,%zmm11,%zmm12
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vpshufb %zmm29,%zmm12,%zmm12
|
|
.L_next_16_ok_661:
|
|
vshufi64x2 $255,%zmm12,%zmm12,%zmm2
|
|
addb $16,%r15b
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm0
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm3
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm4
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm5
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm6
|
|
vpxorq %zmm6,%zmm7,%zmm7
|
|
vpxorq %zmm6,%zmm10,%zmm10
|
|
vpxorq %zmm6,%zmm11,%zmm11
|
|
vpxorq %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 16(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 32(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 48(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 64(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 80(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 96(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 112(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 128(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 144(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 160(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 176(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 192(%rdi),%zmm6
|
|
vaesenclast %zmm6,%zmm7,%zmm7
|
|
vaesenclast %zmm6,%zmm10,%zmm10
|
|
vaesenclast %zmm6,%zmm11,%zmm11
|
|
vaesenclast %zmm6,%zmm12,%zmm12
|
|
|
|
|
|
vpxorq %zmm0,%zmm7,%zmm7
|
|
vpxorq %zmm3,%zmm10,%zmm10
|
|
vpxorq %zmm4,%zmm11,%zmm11
|
|
vpxorq %zmm5,%zmm12,%zmm12
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm7,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm10,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm11,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm12,192(%r10,%r11,1)
|
|
|
|
vpshufb %zmm29,%zmm0,%zmm7
|
|
vpshufb %zmm29,%zmm3,%zmm10
|
|
vpshufb %zmm29,%zmm4,%zmm11
|
|
vpshufb %zmm29,%zmm5,%zmm12
|
|
vmovdqa64 %zmm7,768(%rsp)
|
|
vmovdqa64 %zmm10,832(%rsp)
|
|
vmovdqa64 %zmm11,896(%rsp)
|
|
vmovdqa64 %zmm12,960(%rsp)
|
|
testq %r14,%r14
|
|
jnz .L_skip_hkeys_precomputation_662
|
|
|
|
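# First call with this state (%r14 == 0): copy what appear to be 16 precomputed powers of H from the context into the top of the stack table used by the main loop.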
vmovdqu64 288(%rsi),%zmm0
|
|
vmovdqu64 %zmm0,704(%rsp)
|
|
|
|
vmovdqu64 224(%rsi),%zmm3
|
|
vmovdqu64 %zmm3,640(%rsp)
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
|
|
|
|
vmovdqu64 160(%rsi),%zmm4
|
|
vmovdqu64 %zmm4,576(%rsp)
|
|
|
|
vmovdqu64 96(%rsi),%zmm5
|
|
vmovdqu64 %zmm5,512(%rsp)
|
|
.L_skip_hkeys_precomputation_662:
|
|
cmpq $512,%r8
|
|
jb .L_message_below_32_blocks_659
|
|
|
|
|
|
|
|
cmpb $240,%r15b
|
|
jae .L_next_16_overflow_663
|
|
vpaddd %zmm28,%zmm2,%zmm7
|
|
vpaddd %zmm27,%zmm7,%zmm10
|
|
vpaddd %zmm27,%zmm10,%zmm11
|
|
vpaddd %zmm27,%zmm11,%zmm12
|
|
jmp .L_next_16_ok_663
|
|
.L_next_16_overflow_663:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm12
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
|
|
vpaddd %zmm12,%zmm7,%zmm10
|
|
vpaddd %zmm12,%zmm10,%zmm11
|
|
vpaddd %zmm12,%zmm11,%zmm12
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vpshufb %zmm29,%zmm12,%zmm12
|
|
.L_next_16_ok_663:
|
|
vshufi64x2 $255,%zmm12,%zmm12,%zmm2
|
|
addb $16,%r15b
|
|
|
|
vmovdqu8 256(%rcx,%r11,1),%zmm0
|
|
vmovdqu8 320(%rcx,%r11,1),%zmm3
|
|
vmovdqu8 384(%rcx,%r11,1),%zmm4
|
|
vmovdqu8 448(%rcx,%r11,1),%zmm5
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm6
|
|
vpxorq %zmm6,%zmm7,%zmm7
|
|
vpxorq %zmm6,%zmm10,%zmm10
|
|
vpxorq %zmm6,%zmm11,%zmm11
|
|
vpxorq %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 16(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 32(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 48(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 64(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 80(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 96(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 112(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 128(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 144(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 160(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 176(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 192(%rdi),%zmm6
|
|
vaesenclast %zmm6,%zmm7,%zmm7
|
|
vaesenclast %zmm6,%zmm10,%zmm10
|
|
vaesenclast %zmm6,%zmm11,%zmm11
|
|
vaesenclast %zmm6,%zmm12,%zmm12
|
|
|
|
|
|
vpxorq %zmm0,%zmm7,%zmm7
|
|
vpxorq %zmm3,%zmm10,%zmm10
|
|
vpxorq %zmm4,%zmm11,%zmm11
|
|
vpxorq %zmm5,%zmm12,%zmm12
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm7,256(%r10,%r11,1)
|
|
vmovdqu8 %zmm10,320(%r10,%r11,1)
|
|
vmovdqu8 %zmm11,384(%r10,%r11,1)
|
|
vmovdqu8 %zmm12,448(%r10,%r11,1)
|
|
|
|
vpshufb %zmm29,%zmm0,%zmm7
|
|
vpshufb %zmm29,%zmm3,%zmm10
|
|
vpshufb %zmm29,%zmm4,%zmm11
|
|
vpshufb %zmm29,%zmm5,%zmm12
|
|
vmovdqa64 %zmm7,1024(%rsp)
|
|
vmovdqa64 %zmm10,1088(%rsp)
|
|
vmovdqa64 %zmm11,1152(%rsp)
|
|
vmovdqa64 %zmm12,1216(%rsp)
|
|
testq %r14,%r14
|
|
jnz .L_skip_hkeys_precomputation_664
|
|
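# Extend the stack table of H powers: each step multiplies an existing group of four powers by a broadcast power (vpclmulqdq), reduces with POLY2, and stores the new group at the next lower stack slot, down to 0(%rsp).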
vmovdqu64 640(%rsp),%zmm3
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
|
|
|
|
vmovdqu64 576(%rsp),%zmm4
|
|
vmovdqu64 512(%rsp),%zmm5
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,448(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,384(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,320(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,256(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,192(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,128(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,64(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,0(%rsp)
|
|
.L_skip_hkeys_precomputation_664:
|
|
movq $1,%r14
|
|
addq $512,%r11
|
|
subq $512,%r8
|
|
|
|
cmpq $768,%r8
|
|
jb .L_no_more_big_nblocks_659
|
|
.L_encrypt_big_nblocks_659:
|
|
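# Main loop: each iteration handles 48 blocks (3 groups of 16), interleaving the AES rounds for the new counter blocks with GHASH of the 48 reflected ciphertext blocks saved from the previous iteration.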
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_665
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_665
|
|
.L_16_blocks_overflow_665:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_665:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
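# First group of the iteration: fold the current GHASH accumulator (zmm14) into the oldest saved ciphertext block (768(%rsp)) and pair it with the corresponding power of H from 0(%rsp).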
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vpxorq %zmm12,%zmm6,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm17,%zmm0
|
|
vpshufb %zmm29,%zmm19,%zmm3
|
|
vpshufb %zmm29,%zmm20,%zmm4
|
|
vpshufb %zmm29,%zmm21,%zmm5
|
|
vmovdqa64 %zmm0,1280(%rsp)
|
|
vmovdqa64 %zmm3,1344(%rsp)
|
|
vmovdqa64 %zmm4,1408(%rsp)
|
|
vmovdqa64 %zmm5,1472(%rsp)
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_666
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_666
|
|
.L_16_blocks_overflow_666:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_666:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 256(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 320(%rsp),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 384(%rsp),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 448(%rsp),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 256(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 320(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 384(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 448(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vpternlogq $0x96,%zmm12,%zmm6,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,256(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,320(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,384(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,448(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm17,%zmm0
|
|
vpshufb %zmm29,%zmm19,%zmm3
|
|
vpshufb %zmm29,%zmm20,%zmm4
|
|
vpshufb %zmm29,%zmm21,%zmm5
|
|
vmovdqa64 %zmm0,768(%rsp)
|
|
vmovdqa64 %zmm3,832(%rsp)
|
|
vmovdqa64 %zmm4,896(%rsp)
|
|
vmovdqa64 %zmm5,960(%rsp)
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_667
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_667
|
|
.L_16_blocks_overflow_667:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_667:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 512(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 576(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 640(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 704(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
|
|
|
|
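# Third group: finish accumulating, then perform the GHASH reduction with POLY2 interleaved with the remaining AES rounds; the reduced value ends up in xmm6 and is copied to zmm14 as the accumulator for the next iteration.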
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpternlogq $0x96,%zmm15,%zmm12,%zmm6
|
|
vpxorq %zmm24,%zmm6,%zmm6
|
|
vpternlogq $0x96,%zmm10,%zmm13,%zmm7
|
|
vpxorq %zmm25,%zmm7,%zmm7
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vextracti64x4 $1,%zmm6,%ymm12
|
|
vpxorq %ymm12,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm12
|
|
vpxorq %xmm12,%xmm6,%xmm6
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm6
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,512(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,576(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,640(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,704(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm17,%zmm0
|
|
vpshufb %zmm29,%zmm19,%zmm3
|
|
vpshufb %zmm29,%zmm20,%zmm4
|
|
vpshufb %zmm29,%zmm21,%zmm5
|
|
vmovdqa64 %zmm0,1024(%rsp)
|
|
vmovdqa64 %zmm3,1088(%rsp)
|
|
vmovdqa64 %zmm4,1152(%rsp)
|
|
vmovdqa64 %zmm5,1216(%rsp)
|
|
vmovdqa64 %zmm6,%zmm14
|
|
|
|
addq $768,%r11
|
|
subq $768,%r8
|
|
cmpq $768,%r8
|
|
jae .L_encrypt_big_nblocks_659
|
|
|
|
.L_no_more_big_nblocks_659:
|
|
|
|
cmpq $512,%r8
|
|
jae .L_encrypt_32_blocks_659
|
|
|
|
cmpq $256,%r8
|
|
jae .L_encrypt_16_blocks_659
|
|
.L_encrypt_0_blocks_ghash_32_659:
|
|
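# Fewer than 16 new blocks remain but two groups of saved ciphertext still need GHASH: %rbx appears to select the H powers matching the total block count; the older 16 saved blocks are folded here and the newer 16 in the tail handlers below.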
movl %r8d,%r10d
|
|
andl $~15,%r10d
|
|
movl $256,%ebx
|
|
subl %r10d,%ebx
|
|
vmovdqa64 768(%rsp),%zmm13
|
|
vpxorq %zmm14,%zmm13,%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 832(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpxorq %zmm10,%zmm4,%zmm26
|
|
vpxorq %zmm6,%zmm0,%zmm24
|
|
vpxorq %zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 896(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 960(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
addl $256,%ebx
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
|
|
je .L_last_num_blocks_is_0_668
|
|
|
|
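# Dispatch to the handler for the exact number of remaining blocks (1..16) via a compare tree.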
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_668
|
|
jb .L_last_num_blocks_is_7_1_668
|
|
|
|
|
|
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_668
|
|
jb .L_last_num_blocks_is_11_9_668
|
|
|
|
|
|
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_668
|
|
ja .L_last_num_blocks_is_16_668
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_668
|
|
jmp .L_last_num_blocks_is_13_668
|
|
|
|
.L_last_num_blocks_is_11_9_668:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_668
|
|
ja .L_last_num_blocks_is_11_668
|
|
jmp .L_last_num_blocks_is_9_668
|
|
|
|
.L_last_num_blocks_is_7_1_668:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_668
|
|
jb .L_last_num_blocks_is_3_1_668
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_668
|
|
je .L_last_num_blocks_is_6_668
|
|
jmp .L_last_num_blocks_is_5_668
|
|
|
|
.L_last_num_blocks_is_3_1_668:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_668
|
|
je .L_last_num_blocks_is_2_668
|
|
.L_last_num_blocks_is_1_668:
|
|
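# One (possibly partial) block remains: build a byte mask from byte64_len_to_mask_table, encrypt a single counter block while GHASHing the 16 saved ciphertext blocks, then XOR and store under the mask.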
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_669
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_669
|
|
|
|
.L_16_blocks_overflow_669:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_669:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %xmm29,%xmm17,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_670
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_670
|
|
.L_small_initial_partial_block_670:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
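# The remaining block is partial: record its byte count, save its state at 16(%rsi) for the next call, reduce the pending GHASH products, and XOR the reflected partial data into the accumulator so it is multiplied later.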
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm0
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
|
|
vpslldq $8,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm3
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
|
|
vpsrldq $4,%xmm4,%xmm4
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm4,%xmm14
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_670
|
|
.L_small_initial_compute_done_670:
|
|
.L_after_reduction_670:
|
|
jmp .L_last_blocks_done_668
|
|
.L_last_num_blocks_is_2_668:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_671
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_671
|
|
|
|
.L_16_blocks_overflow_671:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_671:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %ymm29,%ymm17,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_672
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_672
|
|
.L_small_initial_partial_block_672:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_672:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_672
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_672:
|
|
jmp .L_last_blocks_done_668
|
|
.L_last_num_blocks_is_3_668:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_673
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_673
|
|
|
|
.L_16_blocks_overflow_673:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_673:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_674
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_674
|
|
.L_small_initial_partial_block_674:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_674:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_674
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_674:
|
|
jmp .L_last_blocks_done_668
|
|
.L_last_num_blocks_is_4_668:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_675
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_675
|
|
|
|
.L_16_blocks_overflow_675:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_675:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_676
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_676
|
|
.L_small_initial_partial_block_676:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_676:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_676
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_676:
|
|
jmp .L_last_blocks_done_668
|
|
.L_last_num_blocks_is_5_668:
|
|
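# Five blocks remain: four are processed in zmm0 and the fifth in xmm3; the byte mask (length minus 64) covers only the final, possibly partial, block.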
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_677
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_677
|
|
|
|
.L_16_blocks_overflow_677:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_677:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
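/* %zmm24, %zmm25 and %zmm26 accumulate the high, low and middle carry-less
   partial products of GHASH over the blocks buffered on the stack; they are
   combined with the products of the tail blocks and reduced further below. */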
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %xmm29,%xmm19,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
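/* Account for the 4 complete blocks; %r8 now holds only the byte count of
   block 5.  A full 16 bytes means no partial block is pending, so (%rdx) is
   cleared and all 5 blocks are hashed now; fewer than 16 bytes takes the
   partial-block path below. */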
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_678
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_678
|
|
.L_small_initial_partial_block_678:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_678:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_678
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_678:
|
|
jmp .L_last_blocks_done_668
|
|
.L_last_num_blocks_is_6_668:
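/* Tail of 6 blocks: same structure as the 5-block case, except the trailing
   group is two blocks wide, so the ymm forms of the same registers carry the
   extra counter block, the masked load/store and the final hash input. */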
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_679
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_679
|
|
|
|
.L_16_blocks_overflow_679:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_679:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %ymm29,%ymm19,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_680
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_680
|
|
.L_small_initial_partial_block_680:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_680:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_680
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_680:
|
|
jmp .L_last_blocks_done_668
|
|
.L_last_num_blocks_is_7_668:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_681
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_681
|
|
|
|
.L_16_blocks_overflow_681:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_681:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_682
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
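/* The context at (%rsi) appears to hold precomputed powers of the hash key,
   highest power at the lowest offset and H^1 at 336(%rsi); here the three
   lowest powers are gathered into one register to match the three blocks
   hashed by the multiplies that follow. */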
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_682
|
|
.L_small_initial_partial_block_682:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_682:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_682
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_682:
|
|
jmp .L_last_blocks_done_668
|
|
.L_last_num_blocks_is_8_668:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_683
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_683
|
|
|
|
.L_16_blocks_overflow_683:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_683:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_684
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_684
|
|
.L_small_initial_partial_block_684:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_684:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_684
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_684:
|
|
jmp .L_last_blocks_done_668
|
|
.L_last_num_blocks_is_9_668:
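/* From 9 blocks up, two full 64-byte groups precede the masked group, so the
   mask index below is the remaining length minus 128, and a third counter
   register (%zmm4 and its narrower forms) carries the extra blocks. */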
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_685
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_685
|
|
|
|
.L_16_blocks_overflow_685:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_685:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
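/* The first two 64-byte groups are loaded in full; only the trailing block is
   loaded through %k1 with zeroing, so bytes past the end of the input are not
   read at all (masked loads suppress faults) and do not reach the stored
   output or the data being hashed. */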
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %xmm29,%xmm20,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_686
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_686
|
|
.L_small_initial_partial_block_686:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_686:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_686
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_686:
|
|
jmp .L_last_blocks_done_668
|
|
.L_last_num_blocks_is_10_668:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_687
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_687
|
|
|
|
.L_16_blocks_overflow_687:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_687:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %ymm29,%ymm20,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_688
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_688
|
|
.L_small_initial_partial_block_688:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_688:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_688
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_688:
|
|
jmp .L_last_blocks_done_668
|
|
.L_last_num_blocks_is_11_668:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_689
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_689
|
|
|
|
.L_16_blocks_overflow_689:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_689:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_690
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_690
|
|
.L_small_initial_partial_block_690:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_690:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_690
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_690:
|
|
jmp .L_last_blocks_done_668
|
|
.L_last_num_blocks_is_12_668:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_691
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_691
|
|
|
|
.L_16_blocks_overflow_691:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_691:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_692
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_692
|
|
.L_small_initial_partial_block_692:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_692:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_692
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_692:
|
|
jmp .L_last_blocks_done_668
|
|
.L_last_num_blocks_is_13_668:
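/* Tail of 13 blocks: three full 64-byte groups plus one masked block.  The
   mask index is the remaining length minus 192, and a fourth counter
   register (%xmm5) carries block 13. */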
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_693
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_693
|
|
|
|
.L_16_blocks_overflow_693:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_693:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
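# Load 13 blocks of input; the last 16-byte lane is masked ({%k1}{z}) so the
# read stops at the end of the buffer.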
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %xmm29,%xmm21,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
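# xmm11 = last output block, xmm7 = byte-reflected last input block; both
# are needed by the partial-block bookkeeping below.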
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_694
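# All 13 blocks are complete: clear the partial-byte count and hash them
# against what are presumably the hash-key powers H^13..H^1 stored at
# 144..336(%rsi) in the context, then reduce.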
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_694
|
|
.L_small_initial_partial_block_694:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_694:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_694
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_694:
|
|
jmp .L_last_blocks_done_668
|
|
.L_last_num_blocks_is_14_668:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_695
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_695
|
|
|
|
.L_16_blocks_overflow_695:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_695:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %ymm29,%ymm21,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_696
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_696
|
|
.L_small_initial_partial_block_696:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_696:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_696
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_696:
|
|
jmp .L_last_blocks_done_668
|
|
.L_last_num_blocks_is_15_668:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_697
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_697
|
|
|
|
.L_16_blocks_overflow_697:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_697:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_698
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_698
|
|
.L_small_initial_partial_block_698:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_698:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_698
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_698:
|
|
jmp .L_last_blocks_done_668
|
|
.L_last_num_blocks_is_16_668:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_699
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_699
|
|
|
|
.L_16_blocks_overflow_699:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_699:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_700:
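# A 16-block tail always takes this path: record the trailing byte count and
# last output block, hash the preceding blocks via the stored key powers, and
# XOR the byte-reflected final block into the digest after reduction.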
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_700:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_700:
|
|
jmp .L_last_blocks_done_668
|
|
.L_last_num_blocks_is_0_668:
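# No leftover blocks: multiply the 16 previously saved, byte-reflected data
# blocks at 1024..1216(%rsp) by the matching hash-key powers at
# 0..192(%rsp,%rbx,1), fold them into zmm24/25/26 and reduce the digest
# into xmm14.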
|
|
vmovdqa64 1024(%rsp),%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1088(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1152(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1216(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_668:
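# Restore the final counter block to its stored byte order before the
# common exit path.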
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_659
|
|
.L_encrypt_32_blocks_659:
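# 32-block path: two 16-block passes.  Each pass generates 16 counter
# blocks, interleaves the AES rounds with GHASH of an earlier group saved
# on the stack, writes 256 bytes of output and parks the byte-reflected
# input for a later GHASH pass; the accumulators are then reduced and the
# remaining length is dispatched to a tail handler.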
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_701
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_701
|
|
.L_16_blocks_overflow_701:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_701:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vpxorq %zmm12,%zmm6,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm17,%zmm0
|
|
vpshufb %zmm29,%zmm19,%zmm3
|
|
vpshufb %zmm29,%zmm20,%zmm4
|
|
vpshufb %zmm29,%zmm21,%zmm5
|
|
vmovdqa64 %zmm0,1280(%rsp)
|
|
vmovdqa64 %zmm3,1344(%rsp)
|
|
vmovdqa64 %zmm4,1408(%rsp)
|
|
vmovdqa64 %zmm5,1472(%rsp)
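# First 256 bytes of this pass are done; the byte-reflected input blocks sit
# at 1280..1472(%rsp) until they are hashed below.  Now build the counters
# for the second group of 16 blocks.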
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_702
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_702
|
|
.L_16_blocks_overflow_702:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_702:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 256(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 320(%rsp),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 384(%rsp),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 448(%rsp),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 256(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 320(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 384(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 448(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vpternlogq $0x96,%zmm12,%zmm6,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,256(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,320(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,384(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,448(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm17,%zmm0
|
|
vpshufb %zmm29,%zmm19,%zmm3
|
|
vpshufb %zmm29,%zmm20,%zmm4
|
|
vpshufb %zmm29,%zmm21,%zmm5
|
|
vmovdqa64 %zmm0,768(%rsp)
|
|
vmovdqa64 %zmm3,832(%rsp)
|
|
vmovdqa64 %zmm4,896(%rsp)
|
|
vmovdqa64 %zmm5,960(%rsp)
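# Fold the 16 blocks saved at 1280..1472(%rsp) above into the GHASH
# accumulators zmm24/zmm25/zmm26, multiplying by what appear to be the
# matching hash-key powers at 512..704(%rsp).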
|
|
vmovdqa64 1280(%rsp),%zmm13
|
|
vmovdqu64 512(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1344(%rsp),%zmm13
|
|
vmovdqu64 576(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1408(%rsp),%zmm13
|
|
vmovdqu64 640(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1472(%rsp),%zmm13
|
|
vmovdqu64 704(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
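# 512 bytes consumed: advance the offset and remaining length, compute the
# number of remaining 16-byte blocks (rounded up, in %r10d) and a stack
# offset in %rbx that appears to select the starting hash-key power, then
# dispatch to the matching .L_last_num_blocks_is_N handler.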
|
|
|
|
subq $512,%r8
|
|
addq $512,%r11
|
|
movl %r8d,%r10d
|
|
andl $~15,%r10d
|
|
movl $512,%ebx
|
|
subl %r10d,%ebx
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
|
|
je .L_last_num_blocks_is_0_703
|
|
|
|
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_703
|
|
jb .L_last_num_blocks_is_7_1_703
|
|
|
|
|
|
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_703
|
|
jb .L_last_num_blocks_is_11_9_703
|
|
|
|
|
|
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_703
|
|
ja .L_last_num_blocks_is_16_703
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_703
|
|
jmp .L_last_num_blocks_is_13_703
|
|
|
|
.L_last_num_blocks_is_11_9_703:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_703
|
|
ja .L_last_num_blocks_is_11_703
|
|
jmp .L_last_num_blocks_is_9_703
|
|
|
|
.L_last_num_blocks_is_7_1_703:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_703
|
|
jb .L_last_num_blocks_is_3_1_703
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_703
|
|
je .L_last_num_blocks_is_6_703
|
|
jmp .L_last_num_blocks_is_5_703
|
|
|
|
.L_last_num_blocks_is_3_1_703:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_703
|
|
je .L_last_num_blocks_is_2_703
|
|
.L_last_num_blocks_is_1_703:
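# Single trailing block: only one masked counter block is encrypted, using
# xmm-width AES rounds, while the GHASH of the previously saved group still
# runs at full zmm width alongside it.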
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_704
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_704
|
|
|
|
.L_16_blocks_overflow_704:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_704:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %xmm29,%xmm17,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_705
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_705
|
|
.L_small_initial_partial_block_705:
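# The lone block is itself partial: store its byte count and output block,
# reduce only the carried accumulators (zmm24/25/26), then XOR the
# byte-reflected partial block into the digest so it is multiplied on the
# next update.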
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm0
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
|
|
vpslldq $8,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm3
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
|
|
vpsrldq $4,%xmm4,%xmm4
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm4,%xmm14
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_705
|
|
.L_small_initial_compute_done_705:
|
|
.L_after_reduction_705:
|
|
jmp .L_last_blocks_done_703
|
|
.L_last_num_blocks_is_2_703:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_706
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_706
|
|
|
|
.L_16_blocks_overflow_706:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_706:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %ymm29,%ymm17,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_707
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_707
|
|
.L_small_initial_partial_block_707:
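# Partial final block: the leftover byte count is recorded at (%rdx) and the
# last output block (%xmm11) is stashed at 16(%rsi) (context layout assumed),
# then only the complete blocks are multiplied through GHASH here; the partial
# block is absorbed below and its multiply by H is deferred.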
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_707:
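# If %r8 is still non-zero a partial block is pending: XOR its byte-reflected
# ciphertext (%xmm7) into the GHASH accumulator %xmm14 now; the multiply by H
# for that block is picked up on a later pass (interpretation assumed).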
orq %r8,%r8
|
|
je .L_after_reduction_707
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_707:
|
|
jmp .L_last_blocks_done_703
|
|
.L_last_num_blocks_is_3_703:
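# Tail dispatch: 3 blocks (at most 48 bytes) remain.  Each of the
# .L_last_num_blocks_is_N paths below follows the same pattern (register roles
# assumed from context): derive N counter blocks from %zmm2, run the AES
# rounds with keys broadcast from (%rdi) -- 13 round keys, so this looks like
# the AES-192 build of the routine -- while GHASHing the 16 buffered blocks
# kept on the stack, then mask-load the remaining input with %k1, encrypt,
# store, and fold the fresh ciphertext into the GHASH accumulators.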
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_708
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_708
|
|
|
|
.L_16_blocks_overflow_708:
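# Counter byte would wrap: byte-swap with %zmm29, add the big-endian
# ddq_add_1234 increments, and swap back so the carry propagates across the
# whole 32-bit counter field.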
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_708:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_709
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_709
|
|
.L_small_initial_partial_block_709:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_709:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_709
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_709:
|
|
jmp .L_last_blocks_done_703
|
|
.L_last_num_blocks_is_4_703:
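# 4 remaining blocks: a single %zmm0 of counters; %k1, looked up in
# byte64_len_to_mask_table from the remaining length, trims the load and store
# of this final (possibly short) 64-byte group.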
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_710
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_710
|
|
|
|
.L_16_blocks_overflow_710:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_710:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_711
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_711
|
|
.L_small_initial_partial_block_711:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_711:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_711
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_711:
|
|
jmp .L_last_blocks_done_703
|
|
.L_last_num_blocks_is_5_703:
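# 5 remaining blocks: a second counter register appears (%xmm3 for the one
# block past 64 bytes) and the mask-table index becomes %r8-64.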
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_712
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_712
|
|
|
|
.L_16_blocks_overflow_712:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_712:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %xmm29,%xmm19,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_713
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_713
|
|
.L_small_initial_partial_block_713:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_713:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_713
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_713:
|
|
jmp .L_last_blocks_done_703
|
|
.L_last_num_blocks_is_6_703:
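# 6 remaining blocks: the second counter register widens to %ymm3 (blocks 5-6);
# only the trailing group past 64 bytes is masked with %k1.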
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_714
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_714
|
|
|
|
.L_16_blocks_overflow_714:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_714:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %ymm29,%ymm19,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_715
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_715
|
|
.L_small_initial_partial_block_715:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_715:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_715
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_715:
|
|
jmp .L_last_blocks_done_703
|
|
.L_last_num_blocks_is_7_703:
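# 7 remaining blocks: the second counter register is a full %zmm3; the first
# 64 bytes are handled unmasked and %k1 trims the up-to-48-byte tail.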
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_716
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_716
|
|
|
|
.L_16_blocks_overflow_716:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_716:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_717
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_717
|
|
.L_small_initial_partial_block_717:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_717:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_717
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_717:
|
|
jmp .L_last_blocks_done_703
|
|
.L_last_num_blocks_is_8_703:
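# 8 remaining blocks: two full 64-byte counter registers (%zmm0 and %zmm3);
# %k1 still guards the second group in case the very last block is short.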
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_718
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_718
|
|
|
|
.L_16_blocks_overflow_718:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_718:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_719
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_719
|
|
.L_small_initial_partial_block_719:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_719:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_719
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_719:
|
|
jmp .L_last_blocks_done_703
|
|
.L_last_num_blocks_is_9_703:
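# 9 remaining blocks: a third counter register appears (%xmm4 for the one
# block past 128 bytes) and the mask-table index becomes %r8-128.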
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_720
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_720
|
|
|
|
.L_16_blocks_overflow_720:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_720:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %xmm29,%xmm20,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_721
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_721
|
|
.L_small_initial_partial_block_721:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_721:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_721
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_721:
|
|
jmp .L_last_blocks_done_703
|
|
.L_last_num_blocks_is_10_703:
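# 10 remaining blocks: the third counter register widens to %ymm4 (blocks
# 9-10); the group past 128 bytes is the masked one.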
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_722
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_722
|
|
|
|
.L_16_blocks_overflow_722:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_722:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %ymm29,%ymm20,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_723
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_723
|
|
.L_small_initial_partial_block_723:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_723:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_723
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_723:
|
|
jmp .L_last_blocks_done_703
|
|
.L_last_num_blocks_is_11_703:
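# 11 remaining blocks: the third counter register is a full %zmm4; the first
# 128 bytes go through unmasked and %k1 covers the last three blocks.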
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_724
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_724
|
|
|
|
.L_16_blocks_overflow_724:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_724:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_725
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_725
|
|
.L_small_initial_partial_block_725:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_725:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_725
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_725:
|
|
jmp .L_last_blocks_done_703
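/* [annotation; not part of the generated output]
   Tail path for 12 remaining blocks.  As in the other
   .L_last_num_blocks_is_* cases, the vaesenc rounds on the new counter
   blocks are interleaved with vpclmulqdq partial products that fold the
   previous 16 processed blocks (staged at 768(%rsp) onward) against what
   appear to be hash-key powers indexed from 0(%rsp,%rbx,1), accumulating
   into %zmm24/%zmm25/%zmm26. */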
|
|
.L_last_num_blocks_is_12_703:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_726
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_726
|
|
|
|
.L_16_blocks_overflow_726:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_726:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_727
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_727
|
|
.L_small_initial_partial_block_727:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_727:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_727
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_727:
|
|
jmp .L_last_blocks_done_703
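/* [annotation; not part of the generated output]
   Tail path for 13 remaining blocks.  With more than 12 blocks a fourth
   counter vector is needed: one extra counter block is built in %xmm5
   alongside the three full %zmm0/%zmm3/%zmm4 vectors, and the xmm-width
   vaesenc/vaesenclast lane runs in step with the zmm ones. */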
|
|
.L_last_num_blocks_is_13_703:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_728
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_728
|
|
|
|
.L_16_blocks_overflow_728:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_728:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %xmm29,%xmm21,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_729
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_729
|
|
.L_small_initial_partial_block_729:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_729:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_729
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_729:
|
|
jmp .L_last_blocks_done_703
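/* [annotation; not part of the generated output]
   Tail path for 14 remaining blocks: same structure as the 13-block case,
   except the fourth counter vector now holds two blocks, so the extra lane
   is processed at ymm width (%ymm5) and the final partial chunk is loaded
   into %ymm21 under the %k1 mask. */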
|
|
.L_last_num_blocks_is_14_703:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_730
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_730
|
|
|
|
.L_16_blocks_overflow_730:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_730:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %ymm29,%ymm21,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_731
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_731
|
|
.L_small_initial_partial_block_731:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_731:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_731
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_731:
|
|
jmp .L_last_blocks_done_703
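/* [annotation; not part of the generated output]
   Tail path for 15 remaining blocks: the fourth counter vector is processed
   at full zmm width, and only the load/store of the last 64-byte chunk
   (192(%rcx,%r11,1) / 192(%r10,%r11,1)) stays masked by %k1. */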
|
|
.L_last_num_blocks_is_15_703:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_732
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_732
|
|
|
|
.L_16_blocks_overflow_732:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_732:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_733
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_733
|
|
.L_small_initial_partial_block_733:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_733:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_733
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_733:
|
|
jmp .L_last_blocks_done_703
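/* [annotation; not part of the generated output]
   Tail path for a full 16 remaining blocks.  Unlike the shorter cases there
   is no "complete block" branch: after subtracting 16*(16-1) from %r8 the
   code falls straight into .L_small_initial_partial_block_735, which records
   the leftover byte count at (%rdx) and stashes %xmm11 at 16(%rsi) for the
   partial-block bookkeeping before the final GHASH reduction. */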
|
|
.L_last_num_blocks_is_16_703:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_734
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_734
|
|
|
|
.L_16_blocks_overflow_734:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_734:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_735:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_735:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_735:
|
|
jmp .L_last_blocks_done_703
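/* [annotation; not part of the generated output]
   No blocks remain: this path only finishes the GHASH of the 16 blocks
   processed in the previous iteration (staged at 768..960(%rsp), multiplied
   by the key material at 0..192(%rsp,%rbx,1)) and reduces the result into
   %xmm14 via the POLY2 constant. */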
|
|
.L_last_num_blocks_is_0_703:
|
|
vmovdqa64 768(%rsp),%zmm13
|
|
vpxorq %zmm14,%zmm13,%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 832(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpxorq %zmm10,%zmm4,%zmm26
|
|
vpxorq %zmm6,%zmm0,%zmm24
|
|
vpxorq %zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 896(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 960(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_703:
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_659
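/* [annotation; not part of the generated output]
   .L_encrypt_16_blocks_659 processes another full 256-byte group: 16 counter
   blocks are run through the AES rounds while the GHASH of the previous
   group is folded into %zmm24/%zmm25/%zmm26, the 16 input blocks are XORed
   and stored unmasked, and their byte-reflected copies are parked at
   1280..1472(%rsp) so a later iteration or the tail code can hash them. */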
|
|
.L_encrypt_16_blocks_659:
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_736
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_736
|
|
.L_16_blocks_overflow_736:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_736:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp),%zmm1
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vpxorq %zmm12,%zmm6,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm17,%zmm0
|
|
vpshufb %zmm29,%zmm19,%zmm3
|
|
vpshufb %zmm29,%zmm20,%zmm4
|
|
vpshufb %zmm29,%zmm21,%zmm5
|
|
vmovdqa64 %zmm0,1280(%rsp)
|
|
vmovdqa64 %zmm3,1344(%rsp)
|
|
vmovdqa64 %zmm4,1408(%rsp)
|
|
vmovdqa64 %zmm5,1472(%rsp)
|
|
vmovdqa64 1024(%rsp),%zmm13
|
|
vmovdqu64 256(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1088(%rsp),%zmm13
|
|
vmovdqu64 320(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1152(%rsp),%zmm13
|
|
vmovdqu64 384(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1216(%rsp),%zmm13
|
|
vmovdqu64 448(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
subq $256,%r8
|
|
addq $256,%r11
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
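/* [annotation; not part of the generated output]
   %r10d now holds the number of blocks left after this 256-byte group,
   i.e. (%r8d + 15) >> 4.  The compare tree below dispatches to one of the
   .L_last_num_blocks_is_0_737 .. _is_16_737 tail handlers for that count. */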
|
|
je .L_last_num_blocks_is_0_737
|
|
|
|
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_737
|
|
jb .L_last_num_blocks_is_7_1_737
|
|
|
|
|
|
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_737
|
|
jb .L_last_num_blocks_is_11_9_737
|
|
|
|
|
|
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_737
|
|
ja .L_last_num_blocks_is_16_737
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_737
|
|
jmp .L_last_num_blocks_is_13_737
|
|
|
|
.L_last_num_blocks_is_11_9_737:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_737
|
|
ja .L_last_num_blocks_is_11_737
|
|
jmp .L_last_num_blocks_is_9_737
|
|
|
|
.L_last_num_blocks_is_7_1_737:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_737
|
|
jb .L_last_num_blocks_is_3_1_737
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_737
|
|
je .L_last_num_blocks_is_6_737
|
|
jmp .L_last_num_blocks_is_5_737
|
|
|
|
.L_last_num_blocks_is_3_1_737:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_737
|
|
je .L_last_num_blocks_is_2_737
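/* [annotation; not part of the generated output]
   Single remaining block: one counter block is built in %xmm0, the AES
   rounds run at xmm width, and the (possibly partial) block is read and
   written through the %k1 byte mask taken from byte64_len_to_mask_table,
   indexed directly by the remaining byte count in %r8. */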
|
|
.L_last_num_blocks_is_1_737:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_738
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_738
|
|
|
|
.L_16_blocks_overflow_738:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_738:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
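# Fold the four 128-bit GHASH lanes together, then reduce the 256-bit product
# modulo the GHASH polynomial using the precomputed POLY2 constant (two
# carry-less multiplies plus shifts), leaving the reduced hash in %xmm14.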
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %xmm29,%xmm17,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
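# If a full 16-byte block was consumed here, clear the stored partial-block
# count at (%rdx) and hash the block now; otherwise record the leftover byte
# count through %rdx and stash the in-flight block state at 16(%rsi) so a
# later call can complete it.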
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_739
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
jmp .L_small_initial_compute_done_739
|
|
.L_small_initial_partial_block_739:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
vpxorq %xmm7,%xmm14,%xmm14
jmp .L_after_reduction_739
|
|
.L_small_initial_compute_done_739:
|
|
.L_after_reduction_739:
|
|
jmp .L_last_blocks_done_737
|
|
.L_last_num_blocks_is_2_737:
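# Two blocks remain: same flow as the one-block case, but the counter update,
# AES rounds and masked load/store operate on %ymm registers and the full-block
# hash uses the key pair at 320(%rsi).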
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_740
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_740
|
|
|
|
.L_16_blocks_overflow_740:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_740:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %ymm29,%ymm17,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_741
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_741
|
|
.L_small_initial_partial_block_741:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_741:
orq %r8,%r8
|
|
je .L_after_reduction_741
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_741:
|
|
jmp .L_last_blocks_done_737
|
|
.L_last_num_blocks_is_3_737:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_742
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_742
|
|
|
|
.L_16_blocks_overflow_742:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_742:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_743
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_743
|
|
.L_small_initial_partial_block_743:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_743:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_743
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_743:
|
|
jmp .L_last_blocks_done_737
|
|
.L_last_num_blocks_is_4_737:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_744
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_744
|
|
|
|
.L_16_blocks_overflow_744:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_744:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_745
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_745
|
|
.L_small_initial_partial_block_745:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_745:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_745
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_745:
|
|
jmp .L_last_blocks_done_737
|
|
.L_last_num_blocks_is_5_737:
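# Handlers for five or more remaining blocks process one full 64-byte vector
# plus a masked second register; the length is reduced by 64 before indexing
# byte64_len_to_mask_table so the mask covers only the final vector.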
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_746
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_746
|
|
|
|
.L_16_blocks_overflow_746:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_746:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %xmm29,%xmm19,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_747
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_747
|
|
.L_small_initial_partial_block_747:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_747:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_747
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_747:
|
|
jmp .L_last_blocks_done_737
|
|
.L_last_num_blocks_is_6_737:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_748
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_748
|
|
|
|
.L_16_blocks_overflow_748:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_748:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %ymm29,%ymm19,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_749
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_749
|
|
.L_small_initial_partial_block_749:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_749:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_749
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_749:
|
|
jmp .L_last_blocks_done_737
|
|
.L_last_num_blocks_is_7_737:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_750
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_750
|
|
|
|
.L_16_blocks_overflow_750:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_750:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_751
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_751
|
|
.L_small_initial_partial_block_751:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_751:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_751
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_751:
|
|
jmp .L_last_blocks_done_737
|
|
.L_last_num_blocks_is_8_737:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_752
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_752
|
|
|
|
.L_16_blocks_overflow_752:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_752:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_753
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_753
|
|
.L_small_initial_partial_block_753:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_753:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_753
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_753:
|
|
jmp .L_last_blocks_done_737
|
|
.L_last_num_blocks_is_9_737:
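# Nine or ten remaining blocks: two full 64-byte vectors are loaded plus a
# masked tail register, and the length is reduced by 128 before indexing
# byte64_len_to_mask_table.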
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_754
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_754
|
|
|
|
.L_16_blocks_overflow_754:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_754:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %xmm29,%xmm20,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_755
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_755
|
|
.L_small_initial_partial_block_755:
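# Partial final block: record the leftover byte count at (%rdx) and stash the last
# output block at 16(%rsi) so it can be completed later; only the full blocks are
# hashed here, and the reflected partial block (%xmm7) is folded in at _compute_done.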
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_755:
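# If bytes remain in a partial block (%r8 != 0), XOR its byte-reflected copy (%xmm7)
# into the GHASH accumulator %xmm14; the reduction for it happens on a later update.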
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_755
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_755:
|
|
jmp .L_last_blocks_done_737
|
|
.L_last_num_blocks_is_10_737:
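# Exactly 10 blocks remain. %k1 masks the valid bytes of the trailing partial
# 64-byte chunk (byte64_len_to_mask_table), counters are built in %zmm0/%zmm3/%ymm4,
# and the AES rounds below are interleaved with GHASH multiply/accumulate over state
# kept in the stack frame before the masked load/XOR/store of the data itself.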
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_756
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_756
|
|
|
|
.L_16_blocks_overflow_756:
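# Counter-wrap path: the low counter byte would overflow, so byte-reflect the
# counter, add the block increments with full carries, and reflect back; the fast
# path above adds pre-shuffled constants directly.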
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_756:
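# Counters ready: encrypt them with the round keys broadcast from (%rdi) while the
# vpclmulqdq/vpternlogq chain folds the saved GHASH state, then reduce via POLY2
# and apply the keystream to the (partially masked) input below.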
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %ymm29,%ymm20,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_757
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_757
|
|
.L_small_initial_partial_block_757:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_757:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_757
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_757:
|
|
jmp .L_last_blocks_done_737
|
|
.L_last_num_blocks_is_11_737:
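# Same tail structure for 11 remaining blocks: the third counter group becomes a
# masked full %zmm4 and the hash-key-power offsets shift down by 16 bytes.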
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_758
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_758
|
|
|
|
.L_16_blocks_overflow_758:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_758:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_759
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_759
|
|
.L_small_initial_partial_block_759:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_759:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_759
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_759:
|
|
jmp .L_last_blocks_done_737
|
|
.L_last_num_blocks_is_12_737:
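# 12-block tail: third 64-byte group is full width, still stored under the byte mask.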
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_760
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_760
|
|
|
|
.L_16_blocks_overflow_760:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_760:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_761
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_761
|
|
.L_small_initial_partial_block_761:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_761:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_761
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_761:
|
|
jmp .L_last_blocks_done_737
|
|
.L_last_num_blocks_is_13_737:
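# 13-block tail: a fourth counter register (%xmm5) carries the extra block and the
# byte mask now applies to the chunk at offset 192.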
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_762
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_762
|
|
|
|
.L_16_blocks_overflow_762:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_762:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %xmm29,%xmm21,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_763
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_763
|
|
.L_small_initial_partial_block_763:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_763:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_763
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_763:
|
|
jmp .L_last_blocks_done_737
|
|
.L_last_num_blocks_is_14_737:
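# 14-block tail: the fourth counter group widens to %ymm5 (two blocks) under the mask.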
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_764
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_764
|
|
|
|
.L_16_blocks_overflow_764:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_764:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %ymm29,%ymm21,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_765
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_765
|
|
.L_small_initial_partial_block_765:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_765:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_765
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_765:
|
|
jmp .L_last_blocks_done_737
|
|
.L_last_num_blocks_is_15_737:
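# 15-block tail: the fourth counter group is a masked %zmm5 (three blocks).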
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_766
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_766
|
|
|
|
.L_16_blocks_overflow_766:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_766:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_767
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_767
|
|
.L_small_initial_partial_block_767:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_767:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_767
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_767:
|
|
jmp .L_last_blocks_done_737
|
|
.L_last_num_blocks_is_16_737:
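# Tail handler for a remaining group of 16 counter blocks (the last one may be
# short).  A byte mask for the final 64-byte chunk is taken from
# byte64_len_to_mask_table, the four counter vectors are encrypted with the
# round keys broadcast from (%rdi) while the blocks saved on the stack are
# folded into GHASH, then the keystream is XORed with the (masked) input and
# stored.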
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_768
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_768
|
|
|
|
.L_16_blocks_overflow_768:
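# The low byte of the counter is about to wrap: the counter blocks are
# byte-swapped with the mask in %zmm29, incremented via ddq_add_1234 /
# ddq_add_4444, and swapped back to their stored byte order.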
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_768:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_769:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_769:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_769:
|
|
jmp .L_last_blocks_done_737
|
|
.L_last_num_blocks_is_0_737:
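# No whole blocks left to encrypt: multiply the remaining saved blocks by their
# hash-key powers, fold everything into the %zmm24/%zmm25/%zmm26 accumulators,
# and run the final POLY2 reduction into the running GHASH value in %xmm14.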
|
|
vmovdqa64 1280(%rsp),%zmm13
|
|
vmovdqu64 512(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1344(%rsp),%zmm13
|
|
vmovdqu64 576(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1408(%rsp),%zmm13
|
|
vmovdqu64 640(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1472(%rsp),%zmm13
|
|
vmovdqu64 704(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_737:
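# Tail processing finished: put the saved counter block back into stored byte
# order and rejoin the common completion path.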
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_659
|
|
|
|
.L_message_below_32_blocks_659:
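# Fewer than 32 blocks remain after the main loop.  Unless this was done
# already (%r14 != 0), extend the on-stack table of GHASH key powers: each new
# entry is a carry-less multiply of an existing entry by the 128-bit value
# broadcast in %zmm3, followed by the usual POLY2 reduction, with the results
# stored at 448(%rsp) down to 256(%rsp).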
|
|
|
|
|
|
subq $256,%r8
|
|
addq $256,%r11
|
|
movl %r8d,%r10d
|
|
testq %r14,%r14
|
|
jnz .L_skip_hkeys_precomputation_770
|
|
vmovdqu64 640(%rsp),%zmm3
vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
|
|
|
|
vmovdqu64 576(%rsp),%zmm4
|
|
vmovdqu64 512(%rsp),%zmm5
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,448(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,384(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,320(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,256(%rsp)
|
|
.L_skip_hkeys_precomputation_770:
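# %r14 is set so the precomputation above is not repeated; the number of
# remaining 16-byte blocks is computed (%r10d = (%r8d + 15) >> 4) and control
# branches to the matching .L_last_num_blocks_is_*_771 handler below.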
|
|
movq $1,%r14
|
|
andl $~15,%r10d
|
|
movl $512,%ebx
|
|
subl %r10d,%ebx
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
|
|
je .L_last_num_blocks_is_0_771
|
|
|
|
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_771
|
|
jb .L_last_num_blocks_is_7_1_771
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_771
|
|
jb .L_last_num_blocks_is_11_9_771
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_771
|
|
ja .L_last_num_blocks_is_16_771
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_771
|
|
jmp .L_last_num_blocks_is_13_771
|
|
|
|
.L_last_num_blocks_is_11_9_771:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_771
|
|
ja .L_last_num_blocks_is_11_771
|
|
jmp .L_last_num_blocks_is_9_771
|
|
|
|
.L_last_num_blocks_is_7_1_771:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_771
|
|
jb .L_last_num_blocks_is_3_1_771
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_771
|
|
je .L_last_num_blocks_is_6_771
|
|
jmp .L_last_num_blocks_is_5_771
|
|
|
|
.L_last_num_blocks_is_3_1_771:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_771
|
|
je .L_last_num_blocks_is_2_771
|
|
.L_last_num_blocks_is_1_771:
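# Single tail block: masked 16-byte load, one AES-CTR encryption in %xmm0 using
# round keys broadcast from (%rdi), masked store of the XORed result, then the
# masked input block is byte-reflected and folded into GHASH.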
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_772
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_772
|
|
|
|
.L_16_blocks_overflow_772:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_772:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %xmm29,%xmm17,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_773
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_773
|
|
.L_small_initial_partial_block_773:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm0
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
|
|
vpslldq $8,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm3
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
|
|
vpsrldq $4,%xmm4,%xmm4
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm4,%xmm14
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_773
|
|
.L_small_initial_compute_done_773:
|
|
.L_after_reduction_773:
|
|
jmp .L_last_blocks_done_771
|
|
.L_last_num_blocks_is_2_771:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_774
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_774
|
|
|
|
.L_16_blocks_overflow_774:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_774:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %ymm29,%ymm17,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_775
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_775
|
|
.L_small_initial_partial_block_775:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_775:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_775
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_775:
|
|
jmp .L_last_blocks_done_771
|
|
.L_last_num_blocks_is_3_771:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_776
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_776
|
|
|
|
.L_16_blocks_overflow_776:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_776:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_777
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_777
|
|
.L_small_initial_partial_block_777:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_777:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_777
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_777:
|
|
jmp .L_last_blocks_done_771
|
|
.L_last_num_blocks_is_4_771:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_778
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_778
|
|
|
|
.L_16_blocks_overflow_778:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_778:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_779
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_779
|
|
.L_small_initial_partial_block_779:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_779:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_779
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_779:
|
|
jmp .L_last_blocks_done_771
|
|
.L_last_num_blocks_is_5_771:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_780
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_780
|
|
|
|
.L_16_blocks_overflow_780:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_780:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %xmm29,%xmm19,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_781
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_781
|
|
.L_small_initial_partial_block_781:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_781:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_781
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_781:
|
|
jmp .L_last_blocks_done_771
|
|
.L_last_num_blocks_is_6_771:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_782
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_782
|
|
|
|
.L_16_blocks_overflow_782:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_782:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %ymm29,%ymm19,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_783
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_783
|
|
.L_small_initial_partial_block_783:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_783:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_783
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_783:
|
|
jmp .L_last_blocks_done_771
|
|
.L_last_num_blocks_is_7_771:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_784
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_784
|
|
|
|
.L_16_blocks_overflow_784:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_784:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_785
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_785
|
|
.L_small_initial_partial_block_785:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_785:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_785
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_785:
|
|
jmp .L_last_blocks_done_771
|
|
.L_last_num_blocks_is_8_771:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_786
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_786
|
|
|
|
.L_16_blocks_overflow_786:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_786:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_787
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_787
|
|
.L_small_initial_partial_block_787:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_787:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_787
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_787:
|
|
jmp .L_last_blocks_done_771
|
|
.L_last_num_blocks_is_9_771:
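/* Tail handler: 9 blocks remain; two full zmm counter vectors plus one extra
   block in xmm4.  Only that last block is loaded/stored under mask %k1. */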
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_788
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_788
|
|
|
|
.L_16_blocks_overflow_788:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_788:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %xmm29,%xmm20,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_789
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_789
|
|
.L_small_initial_partial_block_789:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_789:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_789
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_789:
|
|
jmp .L_last_blocks_done_771
|
|
.L_last_num_blocks_is_10_771:
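/* Tail handler: 10 blocks; the two extra blocks are carried in ymm4 and are
   loaded/stored under mask %k1. */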
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_790
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_790
|
|
|
|
.L_16_blocks_overflow_790:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_790:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %ymm29,%ymm20,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_791
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_791
|
|
.L_small_initial_partial_block_791:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_791:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_791
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_791:
|
|
jmp .L_last_blocks_done_771
|
|
.L_last_num_blocks_is_11_771:
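/* Tail handler: 11 blocks; the three extra blocks are processed in zmm4, with
   the 128-byte-offset load/store masked by %k1. */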
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_792
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_792
|
|
|
|
.L_16_blocks_overflow_792:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_792:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_793
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_793
|
|
.L_small_initial_partial_block_793:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_793:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_793
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_793:
|
|
jmp .L_last_blocks_done_771
|
|
.L_last_num_blocks_is_12_771:
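/* Tail handler: 12 blocks across three zmm counter vectors; the third
   vector's load/store is masked with %k1 in case the 12th block is partial. */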
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_794
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_794
|
|
|
|
.L_16_blocks_overflow_794:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_794:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_795
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_795
|
|
.L_small_initial_partial_block_795:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_795:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_795
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_795:
|
|
jmp .L_last_blocks_done_771
|
|
.L_last_num_blocks_is_13_771:
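/* Tail handler: 13 blocks; three full zmm vectors plus a single block in
   xmm5 handled under mask %k1 (fourth 64-byte chunk). */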
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_796
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_796
|
|
|
|
.L_16_blocks_overflow_796:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_796:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %xmm29,%xmm21,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_797
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_797
|
|
.L_small_initial_partial_block_797:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_797:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_797
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_797:
|
|
jmp .L_last_blocks_done_771
|
|
.L_last_num_blocks_is_14_771:
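/* Tail handler: 14 blocks; the two extra blocks are carried in ymm5 under
   mask %k1. */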
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_798
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_798
|
|
|
|
.L_16_blocks_overflow_798:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_798:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %ymm29,%ymm21,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_799
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_799
|
|
.L_small_initial_partial_block_799:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_799:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_799
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_799:
|
|
jmp .L_last_blocks_done_771
|
|
.L_last_num_blocks_is_15_771:
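/* Tail handler: 15 blocks; the three extra blocks are carried in zmm5, with
   the load/store at offset 192 masked by %k1. */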
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_800
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_800
|
|
|
|
.L_16_blocks_overflow_800:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_800:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
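# %r8 now holds the byte length of the 15th block (1..16).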
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_801
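# Full final block: the partial-block length at (%rdx) is cleared and all
# 15 blocks are hashed against the key powers at 112..336(%rsi).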
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_801
|
|
.L_small_initial_partial_block_801:
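# Partial final block: its byte count is stored through (%rdx) and %xmm11 is
# saved at 16(%rsi) so a later call can complete it; only the 14 complete
# blocks are hashed here, the partial one is folded in after the reduction.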
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_801:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_801
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_801:
|
|
jmp .L_last_blocks_done_771
|
|
.L_last_num_blocks_is_16_771:
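# Tail of this pass: 16 counter blocks, the 16th possibly partial; same
# structure as the 15-block case above.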
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_802
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_802
|
|
|
|
.L_16_blocks_overflow_802:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_802:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_803:
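# For 16 blocks the partial-block bookkeeping always runs: %r8 now holds the
# length of the 16th block and is reported through (%rdx).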
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_803:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_803:
|
|
jmp .L_last_blocks_done_771
|
|
.L_last_num_blocks_is_0_771:
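# Nothing left to encrypt: multiply the four saved data ZMMs at 768..960(%rsp)
# by the matching key powers, fold the products and reduce with POLY2 into
# the 128-bit GHASH value in %xmm14.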
|
|
vmovdqa64 768(%rsp),%zmm13
|
|
vpxorq %zmm14,%zmm13,%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 832(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpxorq %zmm10,%zmm4,%zmm26
|
|
vpxorq %zmm6,%zmm0,%zmm24
|
|
vpxorq %zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 896(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 960(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_771:
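# Byte-reflect the final counter block in %xmm2 and join the common exit
# path at .L_ghash_done_659.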
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_659
|
|
|
|
.L_message_below_equal_16_blocks_659:
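# Whole message fits in at most 16 blocks: %r12 = ceil(len / 16), then the
# branch tree below dispatches to a dedicated 1..16 block routine.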
movl %r8d,%r12d
|
|
addl $15,%r12d
|
|
shrl $4,%r12d
|
|
cmpq $8,%r12
|
|
je .L_small_initial_num_blocks_is_8_804
|
|
jl .L_small_initial_num_blocks_is_7_1_804
|
|
|
|
|
|
cmpq $12,%r12
|
|
je .L_small_initial_num_blocks_is_12_804
|
|
jl .L_small_initial_num_blocks_is_11_9_804
|
|
|
|
|
|
cmpq $16,%r12
|
|
je .L_small_initial_num_blocks_is_16_804
|
|
cmpq $15,%r12
|
|
je .L_small_initial_num_blocks_is_15_804
|
|
cmpq $14,%r12
|
|
je .L_small_initial_num_blocks_is_14_804
|
|
jmp .L_small_initial_num_blocks_is_13_804
|
|
|
|
.L_small_initial_num_blocks_is_11_9_804:
|
|
|
|
cmpq $11,%r12
|
|
je .L_small_initial_num_blocks_is_11_804
|
|
cmpq $10,%r12
|
|
je .L_small_initial_num_blocks_is_10_804
|
|
jmp .L_small_initial_num_blocks_is_9_804
|
|
|
|
.L_small_initial_num_blocks_is_7_1_804:
|
|
cmpq $4,%r12
|
|
je .L_small_initial_num_blocks_is_4_804
|
|
jl .L_small_initial_num_blocks_is_3_1_804
|
|
|
|
cmpq $7,%r12
|
|
je .L_small_initial_num_blocks_is_7_804
|
|
cmpq $6,%r12
|
|
je .L_small_initial_num_blocks_is_6_804
|
|
jmp .L_small_initial_num_blocks_is_5_804
|
|
|
|
.L_small_initial_num_blocks_is_3_1_804:
|
|
|
|
cmpq $3,%r12
|
|
je .L_small_initial_num_blocks_is_3_804
|
|
cmpq $2,%r12
|
|
je .L_small_initial_num_blocks_is_2_804
.L_small_initial_num_blocks_is_1_804:
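# Single-block message: one masked load, one AES-CTR block using round keys
# broadcast from (%rdi), one masked store.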
|
|
vmovdqa64 SHUF_MASK(%rip),%xmm29
|
|
vpaddd ONE(%rip),%xmm2,%xmm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %xmm15,%xmm0,%xmm0
|
|
vpxorq %xmm6,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %xmm29,%xmm6,%xmm6
|
|
vextracti32x4 $0,%zmm6,%xmm13
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_805
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_805
|
|
.L_small_initial_partial_block_805:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
vpxorq %xmm13,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_805
|
|
.L_small_initial_compute_done_805:
|
|
.L_after_reduction_805:
|
|
jmp .L_small_initial_blocks_encrypted_804
|
|
.L_small_initial_num_blocks_is_2_804:
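# This and the remaining 2..16 block cases repeat the 1-block pattern,
# widening from XMM to YMM to one or more ZMMs and using one extra hash-key
# power per additional block.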
|
|
vmovdqa64 SHUF_MASK(%rip),%ymm29
|
|
vshufi64x2 $0,%ymm2,%ymm2,%ymm0
|
|
vpaddd ddq_add_1234(%rip),%ymm0,%ymm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %ymm15,%ymm0,%ymm0
|
|
vpxorq %ymm6,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %ymm29,%ymm6,%ymm6
|
|
vextracti32x4 $1,%zmm6,%xmm13
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_806
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_806
|
|
.L_small_initial_partial_block_806:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_806:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_806
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_806:
|
|
jmp .L_small_initial_blocks_encrypted_804
|
|
.L_small_initial_num_blocks_is_3_804:
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vextracti32x4 $2,%zmm6,%xmm13
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_807
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_807
|
|
.L_small_initial_partial_block_807:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_807:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_807
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_807:
|
|
jmp .L_small_initial_blocks_encrypted_804
|
|
.L_small_initial_num_blocks_is_4_804:
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vextracti32x4 $3,%zmm6,%xmm13
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_808
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_808
|
|
.L_small_initial_partial_block_808:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_808:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_808
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_808:
|
|
jmp .L_small_initial_blocks_encrypted_804
|
|
.L_small_initial_num_blocks_is_5_804:
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %xmm15,%xmm3,%xmm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %xmm7,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %xmm29,%xmm7,%xmm7
|
|
vextracti32x4 $0,%zmm7,%xmm13
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_809
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_809
|
|
.L_small_initial_partial_block_809:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_809:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_809
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_809:
|
|
jmp .L_small_initial_blocks_encrypted_804
|
|
.L_small_initial_num_blocks_is_6_804:
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %ymm15,%ymm3,%ymm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %ymm7,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %ymm29,%ymm7,%ymm7
|
|
vextracti32x4 $1,%zmm7,%xmm13
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_810
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_810
|
|
.L_small_initial_partial_block_810:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_810:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_810
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_810:
|
|
jmp .L_small_initial_blocks_encrypted_804
|
|
.L_small_initial_num_blocks_is_7_804:
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vextracti32x4 $2,%zmm7,%xmm13
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_811
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_811
|
|
.L_small_initial_partial_block_811:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_811:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_811
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_811:
|
|
jmp .L_small_initial_blocks_encrypted_804
|
|
.L_small_initial_num_blocks_is_8_804:
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vextracti32x4 $3,%zmm7,%xmm13
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_812
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_812
|
|
.L_small_initial_partial_block_812:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_812:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_812
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_812:
|
|
jmp .L_small_initial_blocks_encrypted_804
|
|
.L_small_initial_num_blocks_is_9_804:
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %xmm15,%xmm4,%xmm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %xmm10,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %xmm29,%xmm10,%xmm10
|
|
vextracti32x4 $0,%zmm10,%xmm13
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_813
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_813
|
|
.L_small_initial_partial_block_813:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_813:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_813
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_813:
|
|
jmp .L_small_initial_blocks_encrypted_804
|
|
.L_small_initial_num_blocks_is_10_804:
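# Tail of exactly 10 blocks: build 10 big-endian counter blocks (zmm0, zmm3,
# ymm4), run them through the 12-round (AES-192) key schedule at (%rdi), XOR
# against the ciphertext to decrypt (the last two blocks under byte mask %k1),
# then byte-reflect the ciphertext for the GHASH accumulation that follows.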
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %ymm15,%ymm4,%ymm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %ymm10,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %ymm29,%ymm10,%ymm10
|
|
vextracti32x4 $1,%zmm10,%xmm13
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_814
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_814
|
|
.L_small_initial_partial_block_814:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_814:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_814
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_814:
|
|
jmp .L_small_initial_blocks_encrypted_804
|
|
.L_small_initial_num_blocks_is_11_804:
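# 11-block tail: same structure as the 10-block case, but the third counter
# group carries 3 blocks (zmm4/zmm10 under mask %k1) and the GHASH fold uses
# hash-key powers at correspondingly lower offsets in the context.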
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vextracti32x4 $2,%zmm10,%xmm13
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_815
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_815
|
|
.L_small_initial_partial_block_815:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_815:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_815
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_815:
|
|
jmp .L_small_initial_blocks_encrypted_804
|
|
.L_small_initial_num_blocks_is_12_804:
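# 12-block tail: zmm0/zmm3 hold 8 full blocks and zmm4 the remaining 4 (masked
# load/store via %k1); the last block is taken from lane 3 for the
# partial-block bookkeeping.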
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vextracti32x4 $3,%zmm10,%xmm13
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_816
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 160(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_816
|
|
.L_small_initial_partial_block_816:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_816:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_816
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_816:
|
|
jmp .L_small_initial_blocks_encrypted_804
|
|
.L_small_initial_num_blocks_is_13_804:
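# 13-block tail: a fourth counter register appears (xmm5, with the extra
# ciphertext block in xmm11), so the byte mask is derived from length-192
# rather than length-128.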
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %xmm15,%xmm5,%xmm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %xmm11,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %xmm29,%xmm11,%xmm11
|
|
vextracti32x4 $0,%zmm11,%xmm13
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_817
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 144(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_817
|
|
.L_small_initial_partial_block_817:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 160(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_817:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_817
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_817:
|
|
jmp .L_small_initial_blocks_encrypted_804
|
|
.L_small_initial_num_blocks_is_14_804:
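# 14-block tail: as for 13 blocks, but with two blocks past the 192-byte mark
# handled in ymm5/ymm11.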
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %ymm15,%ymm5,%ymm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %ymm11,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %ymm29,%ymm11,%ymm11
|
|
vextracti32x4 $1,%zmm11,%xmm13
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_818
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 128(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_818
|
|
.L_small_initial_partial_block_818:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 144(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_818:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_818
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_818:
|
|
jmp .L_small_initial_blocks_encrypted_804
|
|
.L_small_initial_num_blocks_is_15_804:
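# 15-block tail: three blocks past the 192-byte mark, handled in zmm5/zmm11
# under mask %k1.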
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %zmm15,%zmm5,%zmm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %zmm11,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vextracti32x4 $2,%zmm11,%xmm13
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_819
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 112(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_819
|
|
.L_small_initial_partial_block_819:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 128(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_819:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_819
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_819:
|
|
jmp .L_small_initial_blocks_encrypted_804
|
|
.L_small_initial_num_blocks_is_16_804:
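# 16-block tail: all four zmm counter registers are full. There is no
# full-block shortcut here; control falls straight into
# .L_small_initial_partial_block_820, which records the remaining byte count
# at (%rdx) and the last output block at 16(%rsi) before the final reduction.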
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %zmm15,%zmm5,%zmm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %zmm11,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vextracti32x4 $3,%zmm11,%xmm13
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_820:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 112(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_820:
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_820:
|
|
.L_small_initial_blocks_encrypted_804:
|
|
.L_ghash_done_659:
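# Store the updated counter block (%xmm2) and GHASH accumulator (%xmm14) back
# into the context at 0(%rsi) and 64(%rsi), then leave the decrypt routine.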
|
|
vmovdqu64 %xmm2,0(%rsi)
|
|
vmovdqu64 %xmm14,64(%rsi)
|
|
.L_enc_dec_done_659:
|
|
jmp .Lexit_gcm_decrypt
|
|
.align 32
|
|
.Laes_gcm_decrypt_256_avx512:
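# AES-256 (14-round) GCM decrypt body: return early on zero length, reload the
# running GHASH value from 64(%rsi), finish any partial block carried over from
# a previous call, then process the message in 16-block (256-byte) AES-CTR +
# GHASH chunks.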
|
|
orq %r8,%r8
|
|
je .L_enc_dec_done_821
|
|
xorq %r14,%r14
|
|
vmovdqu64 64(%rsi),%xmm14
|
|
|
|
movq (%rdx),%r11
|
|
orq %r11,%r11
|
|
je .L_partial_block_done_822
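# A previous call left (%rdx) bytes of an unfinished block: mask-load up to
# 16 bytes of new ciphertext, XOR against the partial-block state stashed at
# 16(%rsi), fold the block into the GHASH accumulator (with a POLY2 reduction
# once it is complete), and write only the newly recovered bytes out under a
# byte mask.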
|
|
movl $16,%r10d
|
|
leaq byte_len_to_mask_table(%rip),%r12
|
|
cmpq %r10,%r8
|
|
cmovcq %r8,%r10
|
|
kmovw (%r12,%r10,2),%k1
|
|
vmovdqu8 (%rcx),%xmm0{%k1}{z}
|
|
|
|
vmovdqu64 16(%rsi),%xmm3
|
|
vmovdqu64 336(%rsi),%xmm4
leaq SHIFT_MASK(%rip),%r12
|
|
addq %r11,%r12
|
|
vmovdqu64 (%r12),%xmm5
|
|
vpshufb %xmm5,%xmm3,%xmm3
|
|
|
|
vmovdqa64 %xmm0,%xmm6
|
|
vpxorq %xmm0,%xmm3,%xmm3
|
|
|
|
|
|
leaq (%r8,%r11,1),%r13
|
|
subq $16,%r13
|
|
jge .L_no_extra_mask_822
|
|
subq %r13,%r12
|
|
.L_no_extra_mask_822:
vmovdqu64 16(%r12),%xmm0
|
|
vpand %xmm0,%xmm3,%xmm3
|
|
vpand %xmm0,%xmm6,%xmm6
|
|
vpshufb SHUF_MASK(%rip),%xmm6,%xmm6
|
|
vpshufb %xmm5,%xmm6,%xmm6
|
|
vpxorq %xmm6,%xmm14,%xmm14
|
|
cmpq $0,%r13
|
|
jl .L_partial_incomplete_822
|
|
|
|
vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7
|
|
vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10
|
|
vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11
|
|
vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14
|
|
vpxorq %xmm11,%xmm14,%xmm14
|
|
|
|
vpsrldq $8,%xmm14,%xmm11
|
|
vpslldq $8,%xmm14,%xmm14
|
|
vpxorq %xmm11,%xmm7,%xmm7
|
|
vpxorq %xmm10,%xmm14,%xmm14
vmovdqu64 POLY2(%rip),%xmm11
|
|
|
|
vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10
|
|
vpslldq $8,%xmm10,%xmm10
|
|
vpxorq %xmm10,%xmm14,%xmm14
vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10
|
|
vpsrldq $4,%xmm10,%xmm10
|
|
vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
|
|
vpternlogq $0x96,%xmm10,%xmm7,%xmm14
|
|
|
|
movq $0,(%rdx)
|
|
|
|
movq %r11,%r12
|
|
movq $16,%r11
|
|
subq %r12,%r11
|
|
jmp .L_enc_dec_done_822
|
|
|
|
.L_partial_incomplete_822:
|
|
addq %r8,(%rdx)
|
|
movq %r8,%r11
|
|
|
|
.L_enc_dec_done_822:
|
|
|
|
|
|
leaq byte_len_to_mask_table(%rip),%r12
|
|
kmovw (%r12,%r11,2),%k1
|
|
vmovdqu64 %xmm14,64(%rsi)
|
|
movq %r9,%r12
|
|
vmovdqu8 %xmm3,(%r12){%k1}
|
|
.L_partial_block_done_822:
|
|
vmovdqu64 0(%rsi),%xmm2
|
|
subq %r11,%r8
|
|
je .L_enc_dec_done_821
|
|
cmpq $256,%r8
|
|
jbe .L_message_below_equal_16_blocks_821
|
|
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vmovdqa64 ddq_addbe_4444(%rip),%zmm27
|
|
vmovdqa64 ddq_addbe_1234(%rip),%zmm28
vmovd %xmm2,%r15d
|
|
andl $255,%r15d
|
|
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpshufb %zmm29,%zmm2,%zmm2
cmpb $240,%r15b
|
|
jae .L_next_16_overflow_823
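# Counter handling for the next 16 blocks: while the low counter byte stays
# below 240 the big-endian increment constants are added directly; the overflow
# path byte-swaps to little-endian, increments there, and swaps back.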
|
|
vpaddd %zmm28,%zmm2,%zmm7
|
|
vpaddd %zmm27,%zmm7,%zmm10
|
|
vpaddd %zmm27,%zmm10,%zmm11
|
|
vpaddd %zmm27,%zmm11,%zmm12
|
|
jmp .L_next_16_ok_823
|
|
.L_next_16_overflow_823:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm12
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
|
|
vpaddd %zmm12,%zmm7,%zmm10
|
|
vpaddd %zmm12,%zmm10,%zmm11
|
|
vpaddd %zmm12,%zmm11,%zmm12
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vpshufb %zmm29,%zmm12,%zmm12
|
|
.L_next_16_ok_823:
|
|
vshufi64x2 $255,%zmm12,%zmm12,%zmm2
|
|
addb $16,%r15b
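# First 16-block chunk: load 256 bytes of ciphertext, run AES-256 CTR over the
# 16 counter blocks, XOR to recover the plaintext, store it, and keep the
# byte-reflected ciphertext at 768..960(%rsp) for the deferred GHASH passes.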
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm0
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm3
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm4
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm5
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm6
|
|
vpxorq %zmm6,%zmm7,%zmm7
|
|
vpxorq %zmm6,%zmm10,%zmm10
|
|
vpxorq %zmm6,%zmm11,%zmm11
|
|
vpxorq %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 16(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 32(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 48(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 64(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 80(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 96(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 112(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 128(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 144(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 160(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 176(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 192(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 208(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 224(%rdi),%zmm6
|
|
vaesenclast %zmm6,%zmm7,%zmm7
|
|
vaesenclast %zmm6,%zmm10,%zmm10
|
|
vaesenclast %zmm6,%zmm11,%zmm11
|
|
vaesenclast %zmm6,%zmm12,%zmm12
|
|
|
|
|
|
vpxorq %zmm0,%zmm7,%zmm7
|
|
vpxorq %zmm3,%zmm10,%zmm10
|
|
vpxorq %zmm4,%zmm11,%zmm11
|
|
vpxorq %zmm5,%zmm12,%zmm12
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm7,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm10,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm11,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm12,192(%r10,%r11,1)
|
|
|
|
vpshufb %zmm29,%zmm0,%zmm7
|
|
vpshufb %zmm29,%zmm3,%zmm10
|
|
vpshufb %zmm29,%zmm4,%zmm11
|
|
vpshufb %zmm29,%zmm5,%zmm12
|
|
vmovdqa64 %zmm7,768(%rsp)
|
|
vmovdqa64 %zmm10,832(%rsp)
|
|
vmovdqa64 %zmm11,896(%rsp)
|
|
vmovdqa64 %zmm12,960(%rsp)
|
|
testq %r14,%r14
|
|
jnz .L_skip_hkeys_precomputation_824
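# Unless a previous pass already did so (%r14 != 0), copy the precomputed
# hash-key powers from the context (96/160/224/288(%rsi)) into the stack frame
# used by the main loop.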
|
|
|
|
vmovdqu64 288(%rsi),%zmm0
|
|
vmovdqu64 %zmm0,704(%rsp)
|
|
|
|
vmovdqu64 224(%rsi),%zmm3
|
|
vmovdqu64 %zmm3,640(%rsp)
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
|
|
|
|
vmovdqu64 160(%rsi),%zmm4
|
|
vmovdqu64 %zmm4,576(%rsp)
|
|
|
|
vmovdqu64 96(%rsi),%zmm5
|
|
vmovdqu64 %zmm5,512(%rsp)
|
|
.L_skip_hkeys_precomputation_824:
|
|
cmpq $512,%r8
|
|
jb .L_message_below_32_blocks_821
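# At least 32 blocks remain: decrypt a second 16-block chunk the same way
# (stashing its reflected ciphertext at 1024..1216(%rsp)) and extend the table
# of hash-key powers on the stack via GHASH multiplications reduced by POLY2.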
cmpb $240,%r15b
|
|
jae .L_next_16_overflow_825
|
|
vpaddd %zmm28,%zmm2,%zmm7
|
|
vpaddd %zmm27,%zmm7,%zmm10
|
|
vpaddd %zmm27,%zmm10,%zmm11
|
|
vpaddd %zmm27,%zmm11,%zmm12
|
|
jmp .L_next_16_ok_825
|
|
.L_next_16_overflow_825:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm12
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
|
|
vpaddd %zmm12,%zmm7,%zmm10
|
|
vpaddd %zmm12,%zmm10,%zmm11
|
|
vpaddd %zmm12,%zmm11,%zmm12
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vpshufb %zmm29,%zmm12,%zmm12
|
|
.L_next_16_ok_825:
|
|
vshufi64x2 $255,%zmm12,%zmm12,%zmm2
|
|
addb $16,%r15b
|
|
|
|
vmovdqu8 256(%rcx,%r11,1),%zmm0
|
|
vmovdqu8 320(%rcx,%r11,1),%zmm3
|
|
vmovdqu8 384(%rcx,%r11,1),%zmm4
|
|
vmovdqu8 448(%rcx,%r11,1),%zmm5
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm6
|
|
vpxorq %zmm6,%zmm7,%zmm7
|
|
vpxorq %zmm6,%zmm10,%zmm10
|
|
vpxorq %zmm6,%zmm11,%zmm11
|
|
vpxorq %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 16(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 32(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 48(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 64(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 80(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 96(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 112(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 128(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 144(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 160(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 176(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 192(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 208(%rdi),%zmm6
|
|
vaesenc %zmm6,%zmm7,%zmm7
|
|
vaesenc %zmm6,%zmm10,%zmm10
|
|
vaesenc %zmm6,%zmm11,%zmm11
|
|
vaesenc %zmm6,%zmm12,%zmm12
|
|
vbroadcastf64x2 224(%rdi),%zmm6
|
|
vaesenclast %zmm6,%zmm7,%zmm7
|
|
vaesenclast %zmm6,%zmm10,%zmm10
|
|
vaesenclast %zmm6,%zmm11,%zmm11
|
|
vaesenclast %zmm6,%zmm12,%zmm12
|
|
|
|
|
|
vpxorq %zmm0,%zmm7,%zmm7
|
|
vpxorq %zmm3,%zmm10,%zmm10
|
|
vpxorq %zmm4,%zmm11,%zmm11
|
|
vpxorq %zmm5,%zmm12,%zmm12
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm7,256(%r10,%r11,1)
|
|
vmovdqu8 %zmm10,320(%r10,%r11,1)
|
|
vmovdqu8 %zmm11,384(%r10,%r11,1)
|
|
vmovdqu8 %zmm12,448(%r10,%r11,1)
|
|
|
|
vpshufb %zmm29,%zmm0,%zmm7
|
|
vpshufb %zmm29,%zmm3,%zmm10
|
|
vpshufb %zmm29,%zmm4,%zmm11
|
|
vpshufb %zmm29,%zmm5,%zmm12
|
|
vmovdqa64 %zmm7,1024(%rsp)
|
|
vmovdqa64 %zmm10,1088(%rsp)
|
|
vmovdqa64 %zmm11,1152(%rsp)
|
|
vmovdqa64 %zmm12,1216(%rsp)
|
|
testq %r14,%r14
|
|
jnz .L_skip_hkeys_precomputation_826
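# First pass only: extend the GHASH key-power table.  Each block below multiplies
# a previously stored group of powers by the broadcast value in %zmm3 over
# GF(2^128) (four vpclmulqdq partial products folded into high/low halves) and
# reduces the 256-bit result with the POLY2 constant, filling the stack slots
# from 448(%rsp) down to 0(%rsp).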
vmovdqu64 640(%rsp),%zmm3
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
|
|
|
|
vmovdqu64 576(%rsp),%zmm4
|
|
vmovdqu64 512(%rsp),%zmm5
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,448(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,384(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,320(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,256(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,192(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,128(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,64(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,0(%rsp)
|
|
.L_skip_hkeys_precomputation_826:
|
|
movq $1,%r14
|
|
addq $512,%r11
|
|
subq $512,%r8
|
|
|
|
cmpq $768,%r8
|
|
jb .L_no_more_big_nblocks_821
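# Main loop: each iteration consumes 48 blocks (three groups of 16).  The AES-CTR
# rounds for the new counter blocks are interleaved with the GHASH multiplies of
# the 48 blocks buffered during the previous pass; partial products accumulate in
# %zmm24/%zmm25/%zmm26 and are reduced during the third group.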
.L_encrypt_big_nblocks_821:
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_827
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_827
|
|
.L_16_blocks_overflow_827:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_827:
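# Group 1 of 3: fold the running GHASH value (%zmm14) into the oldest buffered
# block, then start the GHASH multiplies against the key powers on the stack
# while the AES rounds for counters %zmm0/%zmm3/%zmm4/%zmm5 proceed.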
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vpxorq %zmm12,%zmm6,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm17,%zmm0
|
|
vpshufb %zmm29,%zmm19,%zmm3
|
|
vpshufb %zmm29,%zmm20,%zmm4
|
|
vpshufb %zmm29,%zmm21,%zmm5
|
|
vmovdqa64 %zmm0,1280(%rsp)
|
|
vmovdqa64 %zmm3,1344(%rsp)
|
|
vmovdqa64 %zmm4,1408(%rsp)
|
|
vmovdqa64 %zmm5,1472(%rsp)
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_828
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_828
|
|
.L_16_blocks_overflow_828:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_828:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 256(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 320(%rsp),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 384(%rsp),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 448(%rsp),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 256(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 320(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 384(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 448(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vpternlogq $0x96,%zmm12,%zmm6,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,256(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,320(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,384(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,448(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm17,%zmm0
|
|
vpshufb %zmm29,%zmm19,%zmm3
|
|
vpshufb %zmm29,%zmm20,%zmm4
|
|
vpshufb %zmm29,%zmm21,%zmm5
|
|
vmovdqa64 %zmm0,768(%rsp)
|
|
vmovdqa64 %zmm3,832(%rsp)
|
|
vmovdqa64 %zmm4,896(%rsp)
|
|
vmovdqa64 %zmm5,960(%rsp)
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_829
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_829
|
|
.L_16_blocks_overflow_829:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_829:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 512(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 576(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 640(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 704(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
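# All 48 buffered blocks have been multiplied at this point.  The code below sums
# the partial products, folds them down to 128 bits and reduces modulo the GHASH
# polynomial (POLY2) while the remaining AES rounds continue; the reduced value
# lands in %xmm6 and is copied into %zmm14 as the new accumulator.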
|
|
|
|
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpternlogq $0x96,%zmm15,%zmm12,%zmm6
|
|
vpxorq %zmm24,%zmm6,%zmm6
|
|
vpternlogq $0x96,%zmm10,%zmm13,%zmm7
|
|
vpxorq %zmm25,%zmm7,%zmm7
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vextracti64x4 $1,%zmm6,%ymm12
|
|
vpxorq %ymm12,%ymm6,%ymm6
|
|
vextracti32x4 $1,%ymm6,%xmm12
|
|
vpxorq %xmm12,%xmm6,%xmm6
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm6
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,512(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,576(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,640(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,704(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm17,%zmm0
|
|
vpshufb %zmm29,%zmm19,%zmm3
|
|
vpshufb %zmm29,%zmm20,%zmm4
|
|
vpshufb %zmm29,%zmm21,%zmm5
|
|
vmovdqa64 %zmm0,1024(%rsp)
|
|
vmovdqa64 %zmm3,1088(%rsp)
|
|
vmovdqa64 %zmm4,1152(%rsp)
|
|
vmovdqa64 %zmm5,1216(%rsp)
|
|
vmovdqa64 %zmm6,%zmm14
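# %zmm14 now carries the freshly reduced GHASH accumulator; advance the in/out
# offset by 768 bytes and loop while at least another 768 bytes remain.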
addq $768,%r11
|
|
subq $768,%r8
|
|
cmpq $768,%r8
|
|
jae .L_encrypt_big_nblocks_821
|
|
|
|
.L_no_more_big_nblocks_821:
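# No more full 48-block iterations.  Depending on how many bytes are left, finish
# the GHASH of the blocks still buffered on the stack in 32- or 16-block steps
# before falling through to the 1..16-block tail handlers.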
cmpq $512,%r8
|
|
jae .L_encrypt_32_blocks_821
|
|
|
|
cmpq $256,%r8
|
|
jae .L_encrypt_16_blocks_821
|
|
.L_encrypt_0_blocks_ghash_32_821:
|
|
movl %r8d,%r10d
|
|
andl $~15,%r10d
|
|
movl $256,%ebx
|
|
subl %r10d,%ebx
|
|
vmovdqa64 768(%rsp),%zmm13
|
|
vpxorq %zmm14,%zmm13,%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 832(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpxorq %zmm10,%zmm4,%zmm26
|
|
vpxorq %zmm6,%zmm0,%zmm24
|
|
vpxorq %zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 896(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 960(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
addl $256,%ebx
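# %r10d = number of 16-byte blocks left, rounded up; the branch tree below picks
# the matching .L_last_num_blocks_is_N handler (0..16).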
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
|
|
je .L_last_num_blocks_is_0_830
|
|
|
|
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_830
|
|
jb .L_last_num_blocks_is_7_1_830
|
|
|
|
|
|
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_830
|
|
jb .L_last_num_blocks_is_11_9_830
|
|
|
|
|
|
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_830
|
|
ja .L_last_num_blocks_is_16_830
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_830
|
|
jmp .L_last_num_blocks_is_13_830
|
|
|
|
.L_last_num_blocks_is_11_9_830:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_830
|
|
ja .L_last_num_blocks_is_11_830
|
|
jmp .L_last_num_blocks_is_9_830
|
|
|
|
.L_last_num_blocks_is_7_1_830:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_830
|
|
jb .L_last_num_blocks_is_3_1_830
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_830
|
|
je .L_last_num_blocks_is_6_830
|
|
jmp .L_last_num_blocks_is_5_830
|
|
|
|
.L_last_num_blocks_is_3_1_830:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_830
|
|
je .L_last_num_blocks_is_2_830
|
|
.L_last_num_blocks_is_1_830:
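# Tail of exactly one block: build a byte mask in %k1 from byte64_len_to_mask_table,
# encrypt a single counter block at xmm width, XOR it with the masked source data
# and store through the same mask, then hash the block together with the 16 blocks
# still buffered at 1024..1279(%rsp).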
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_831
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_831
|
|
|
|
.L_16_blocks_overflow_831:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_831:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %xmm29,%xmm17,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
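# A full 16-byte final block clears the partial-block length at (%rdx) and is
# folded into the GHASH sum by the reduction below.  A short block instead records
# its length at (%rdx), saves the last output block at 16(%rsi) for partial-block
# processing, reduces only what has accumulated so far, and then XORs the
# byte-swapped partial data (%xmm7) into the result.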
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_832
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_832
|
|
.L_small_initial_partial_block_832:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm0
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
|
|
vpslldq $8,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm3
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
|
|
vpsrldq $4,%xmm4,%xmm4
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm4,%xmm14
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_832
|
|
.L_small_initial_compute_done_832:
|
|
.L_after_reduction_832:
|
|
jmp .L_last_blocks_done_830
|
|
.L_last_num_blocks_is_2_830:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_833
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_833
|
|
|
|
.L_16_blocks_overflow_833:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_833:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %ymm29,%ymm17,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_834
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_834
|
|
.L_small_initial_partial_block_834:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_834:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_834
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_834:
|
|
jmp .L_last_blocks_done_830
|
|
.L_last_num_blocks_is_3_830:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_835
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_835
|
|
|
|
.L_16_blocks_overflow_835:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_835:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_836
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_836
|
|
.L_small_initial_partial_block_836:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_836:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_836
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_836:
|
|
jmp .L_last_blocks_done_830
|
|
.L_last_num_blocks_is_4_830:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_837
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_837
|
|
|
|
.L_16_blocks_overflow_837:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_837:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_838
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_838
|
|
.L_small_initial_partial_block_838:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_838:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_838
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_838:
|
|
jmp .L_last_blocks_done_830
|
|
.L_last_num_blocks_is_5_830:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_839
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_839
|
|
|
|
.L_16_blocks_overflow_839:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_839:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %xmm29,%xmm19,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_840
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_840
|
|
.L_small_initial_partial_block_840:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_840:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_840
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_840:
|
|
jmp .L_last_blocks_done_830
|
|
.L_last_num_blocks_is_6_830:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_841
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_841
|
|
|
|
.L_16_blocks_overflow_841:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_841:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %ymm29,%ymm19,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_842
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_842
|
|
.L_small_initial_partial_block_842:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_842:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_842
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_842:
|
|
jmp .L_last_blocks_done_830
|
|
.L_last_num_blocks_is_7_830:
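# Tail path: 7 blocks remain (4 in %zmm0, 3 in the %k1-masked part of %zmm3);
# wrap check is cmpl $249 (= 256-7), otherwise identical in structure to the
# 6-block case above.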
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_843
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_843
|
|
|
|
.L_16_blocks_overflow_843:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_843:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_844
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_844
|
|
.L_small_initial_partial_block_844:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_844:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_844
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_844:
|
|
jmp .L_last_blocks_done_830
|
|
.L_last_num_blocks_is_8_830:
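# Tail path: 8 blocks (two full ZMM counter vectors); only the second 64-byte
# chunk's load and store are masked by %k1. Wrap check: cmpl $248 (= 256-8).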
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_845
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_845
|
|
|
|
.L_16_blocks_overflow_845:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_845:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_846
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_846
|
|
.L_small_initial_partial_block_846:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_846:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_846
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_846:
|
|
jmp .L_last_blocks_done_830
|
|
.L_last_num_blocks_is_9_830:
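# Tail path: 9 blocks: %zmm0 and %zmm3 full plus one block in %xmm4. The %k1
# mask now covers the third 64-byte chunk (%r8 - 128); wrap check 256-9 = 247.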
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_847
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_847
|
|
|
|
.L_16_blocks_overflow_847:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_847:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %xmm29,%xmm20,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_848
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_848
|
|
.L_small_initial_partial_block_848:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_848:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_848
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_848:
|
|
jmp .L_last_blocks_done_830
|
|
.L_last_num_blocks_is_10_830:
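# Tail path: 10 blocks: %zmm0 and %zmm3 full plus two blocks in %ymm4;
# wrap check cmpl $246 (= 256-10).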
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_849
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_849
|
|
|
|
.L_16_blocks_overflow_849:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_849:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %ymm29,%ymm20,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_850
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_850
|
|
.L_small_initial_partial_block_850:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_850:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_850
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_850:
|
|
jmp .L_last_blocks_done_830
|
|
.L_last_num_blocks_is_11_830:
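# Tail path: 11 blocks: %zmm0 and %zmm3 full plus three blocks' worth of the
# third chunk, loaded and stored under %k1; wrap check cmpl $245 (= 256-11).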
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_851
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_851
|
|
|
|
.L_16_blocks_overflow_851:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_851:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_852
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_852
|
|
.L_small_initial_partial_block_852:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_852:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_852
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_852:
|
|
jmp .L_last_blocks_done_830
|
|
.L_last_num_blocks_is_12_830:
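# Tail path: 12 blocks (three full ZMM counter vectors); the third 64-byte
# chunk is masked by %k1. Wrap check: cmpl $244 (= 256-12).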
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_853
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_853
|
|
|
|
.L_16_blocks_overflow_853:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_853:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_854
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_854
|
|
.L_small_initial_partial_block_854:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_854:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_854
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_854:
|
|
jmp .L_last_blocks_done_830
|
|
.L_last_num_blocks_is_13_830:
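# Tail path: 13 blocks: three full ZMM counter vectors plus one block in %xmm5.
# The %k1 mask now covers the fourth 64-byte chunk (%r8 - 192); wrap check
# cmpl $243 (= 256-13).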
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_855
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_855
|
|
|
|
.L_16_blocks_overflow_855:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_855:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %xmm29,%xmm21,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_856
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
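# Reduce the 256-bit GHASH product (high half in %xmm0, low half in %xmm3)
# modulo the GCM polynomial: carry-less multiplies by the POLY2 constant fold
# the low half back in, leaving the 128-bit hash in %xmm14.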
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_856
|
|
.L_small_initial_partial_block_856:
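# The 13th block is partial.  Record how many of its bytes are present at
# (%rdx), save the last output block at 16(%rsi) so the block can be finished
# on a later call, and hash only the 12 complete blocks here.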
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_856:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_856
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_856:
|
|
jmp .L_last_blocks_done_830
|
|
.L_last_num_blocks_is_14_830:
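# Tail of exactly 14 blocks: three full ZMM counter vectors plus two more
# counters in a YMM.  The AES rounds on them are interleaved with the GHASH
# of the previously saved ciphertext before XORing with the input loaded
# further down.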
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_857
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_857
|
|
|
|
.L_16_blocks_overflow_857:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_857:
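# Counter blocks are ready.  From here the AES rounds (round keys broadcast
# 128->512 bits from (%rdi)) are interleaved with GHASH of the 16 ciphertext
# blocks saved on the stack by the previous pass; the stacked hash-key powers
# are indexed by %rbx so the slice matches the tail length.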
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
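# vpternlogq with immediate 0x96 computes dst = dst ^ src1 ^ src2, merging
# the partial products of the carry-less multiplies into the running
# high/low/middle GHASH accumulators.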
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %ymm29,%ymm21,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_858
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_858
|
|
.L_small_initial_partial_block_858:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_858:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_858
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_858:
|
|
jmp .L_last_blocks_done_830
|
|
.L_last_num_blocks_is_15_830:
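# Same pattern as the 14-block tail, now with 15 counter blocks; the
# possibly-partial 15th block sits in lane 2 of %zmm5, hence the
# vextracti32x4 $2 extractions below.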
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_859
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_859
|
|
|
|
.L_16_blocks_overflow_859:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_859:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_860
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_860
|
|
.L_small_initial_partial_block_860:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_860:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_860
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_860:
|
|
jmp .L_last_blocks_done_830
|
|
.L_last_num_blocks_is_16_830:
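# Full 16-block tail.  Nothing follows it, so the 16th block may still be
# short: this path has no full-block shortcut, always records the remaining
# byte count, hashes the 15 complete blocks, and XORs the final block into
# the hash state.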
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_861
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_861
|
|
|
|
.L_16_blocks_overflow_861:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_861:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm14,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_862:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_862:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_862:
|
|
jmp .L_last_blocks_done_830
|
|
.L_last_num_blocks_is_0_830:
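# No message blocks remain: finish the GHASH of the 16 ciphertext blocks
# still parked on the stack (four blocks per multiply) and run the final
# reduction so %xmm14 holds the up-to-date hash.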
|
|
vmovdqa64 1024(%rsp),%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1088(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1152(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1216(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_830:
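# Tail finished: restore the byte order of the running counter block in
# %xmm2 before taking the common exit path.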
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_821
|
|
.L_encrypt_32_blocks_821:
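# Bulk path: encrypt the next 32 blocks as two 16-block batches.  Each batch
# runs AES-CTR on four ZMM registers while hashing the 32 ciphertext blocks
# produced by the previous iteration; the new ciphertext is byte-reflected
# and written back to the stack for the next GHASH round.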
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_863
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_863
|
|
.L_16_blocks_overflow_863:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_863:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vpxorq %zmm12,%zmm6,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
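# First batch of this pass: the XOR results seed the GHASH accumulators
# (%zmm24 high, %zmm25 low, %zmm26 middle); the second batch adds into them
# with vpternlogq instead.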
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm17,%zmm0
|
|
vpshufb %zmm29,%zmm19,%zmm3
|
|
vpshufb %zmm29,%zmm20,%zmm4
|
|
vpshufb %zmm29,%zmm21,%zmm5
|
|
vmovdqa64 %zmm0,1280(%rsp)
|
|
vmovdqa64 %zmm3,1344(%rsp)
|
|
vmovdqa64 %zmm4,1408(%rsp)
|
|
vmovdqa64 %zmm5,1472(%rsp)
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_864
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_864
|
|
.L_16_blocks_overflow_864:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_864:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1024(%rsp),%zmm8
|
|
vmovdqu64 256(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 320(%rsp),%zmm18
|
|
vmovdqa64 1088(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 384(%rsp),%zmm1
|
|
vmovdqa64 1152(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 448(%rsp),%zmm18
|
|
vmovdqa64 1216(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 256(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 320(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 384(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 448(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm15,%zmm10,%zmm26
|
|
vpternlogq $0x96,%zmm12,%zmm6,%zmm24
|
|
vpternlogq $0x96,%zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,256(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,320(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,384(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,448(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm17,%zmm0
|
|
vpshufb %zmm29,%zmm19,%zmm3
|
|
vpshufb %zmm29,%zmm20,%zmm4
|
|
vpshufb %zmm29,%zmm21,%zmm5
|
|
vmovdqa64 %zmm0,768(%rsp)
|
|
vmovdqa64 %zmm3,832(%rsp)
|
|
vmovdqa64 %zmm4,896(%rsp)
|
|
vmovdqa64 %zmm5,960(%rsp)
|
|
vmovdqa64 1280(%rsp),%zmm13
|
|
vmovdqu64 512(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1344(%rsp),%zmm13
|
|
vmovdqu64 576(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1408(%rsp),%zmm13
|
|
vmovdqu64 640(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1472(%rsp),%zmm13
|
|
vmovdqu64 704(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
subq $512,%r8
|
|
addq $512,%r11
|
|
movl %r8d,%r10d
|
|
andl $~15,%r10d
|
|
movl $512,%ebx
|
|
subl %r10d,%ebx
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
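# %ebx = 512 minus the tail length rounded down to 16, so later loads pick
# the slice of stacked hash-key powers that matches the tail; %r10d =
# ceil(remaining/16) is the number of tail blocks and drives the branch tree
# below.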
|
|
je .L_last_num_blocks_is_0_865
|
|
|
|
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_865
|
|
jb .L_last_num_blocks_is_7_1_865
|
|
|
|
|
|
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_865
|
|
jb .L_last_num_blocks_is_11_9_865
|
|
|
|
|
|
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_865
|
|
ja .L_last_num_blocks_is_16_865
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_865
|
|
jmp .L_last_num_blocks_is_13_865
|
|
|
|
.L_last_num_blocks_is_11_9_865:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_865
|
|
ja .L_last_num_blocks_is_11_865
|
|
jmp .L_last_num_blocks_is_9_865
|
|
|
|
.L_last_num_blocks_is_7_1_865:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_865
|
|
jb .L_last_num_blocks_is_3_1_865
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_865
|
|
je .L_last_num_blocks_is_6_865
|
|
jmp .L_last_num_blocks_is_5_865
|
|
|
|
.L_last_num_blocks_is_3_1_865:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_865
|
|
je .L_last_num_blocks_is_2_865
|
|
.L_last_num_blocks_is_1_865:
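# Single-block tail: one counter block in %xmm0, the AES rounds run at XMM
# width, and the load/store of at most 16 bytes goes through mask %k1.  The
# GHASH of the previous pass is still wound down alongside the rounds.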
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_866
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_866
|
|
|
|
.L_16_blocks_overflow_866:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_866:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %xmm29,%xmm17,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_867
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_867
|
|
.L_small_initial_partial_block_867:
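# The lone block is partial: store its byte count at (%rdx) and the block at
# 16(%rsi), reduce only the carried-over accumulators (%zmm24/25/26) into
# %xmm14, then XOR the partial ciphertext in so it can be folded in properly
# once the block is completed on a later call.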
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm0
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
|
|
vpslldq $8,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm3
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
|
|
vpsrldq $4,%xmm4,%xmm4
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm4,%xmm14
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_867
|
|
.L_small_initial_compute_done_867:
|
|
.L_after_reduction_867:
|
|
jmp .L_last_blocks_done_865
|
|
.L_last_num_blocks_is_2_865:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_868
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_868
|
|
|
|
.L_16_blocks_overflow_868:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_868:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %ymm29,%ymm17,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_869
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_869
|
|
.L_small_initial_partial_block_869:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_869:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_869
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_869:
|
|
jmp .L_last_blocks_done_865
|
|
.L_last_num_blocks_is_3_865:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_870
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_870
|
|
|
|
.L_16_blocks_overflow_870:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_870:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_871
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
jmp .L_small_initial_compute_done_871
|
|
.L_small_initial_partial_block_871:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
.L_small_initial_compute_done_871:
orq %r8,%r8
|
|
je .L_after_reduction_871
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_871:
|
|
jmp .L_last_blocks_done_865
|
|
.L_last_num_blocks_is_4_865:
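# Reader's annotation: same pattern as the 3-block case, but all four lanes of
# %zmm0 carry live counter blocks, so the updated counter and the deferred
# GHASH block are taken from lane 3.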
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_872
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_872
|
|
|
|
.L_16_blocks_overflow_872:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_872:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_873
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_873
|
|
.L_small_initial_partial_block_873:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_873:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_873
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_873:
|
|
jmp .L_last_blocks_done_865
|
|
.L_last_num_blocks_is_5_865:
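# Reader's annotation: five trailing blocks span two vector registers (a full
# ZMM plus one XMM), so %rax is biased by 64 before the
# byte64_len_to_mask_table lookup and the %k1 mask only governs the bytes
# beyond the first 64.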
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_874
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_874
|
|
|
|
.L_16_blocks_overflow_874:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_874:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %xmm29,%xmm19,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_875
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_875
|
|
.L_small_initial_partial_block_875:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_875:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_875
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_875:
|
|
jmp .L_last_blocks_done_865
|
|
.L_last_num_blocks_is_6_865:
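# Reader's annotation: as in the 5-block case, with the fifth and sixth blocks
# carried in %ymm3/%ymm19 and the %k1 mask applied to the second 64-byte half
# of the load and store.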
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_876
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_876
|
|
|
|
.L_16_blocks_overflow_876:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_876:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %ymm29,%ymm19,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_877
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_877
|
|
.L_small_initial_partial_block_877:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_877:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_877
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_877:
|
|
jmp .L_last_blocks_done_865
|
|
.L_last_num_blocks_is_7_865:
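# Reader's annotation: seven trailing blocks; the second counter vector is a
# full %zmm3 of which only three lanes are consumed, so the masked 64-byte
# load/store and the lane-2 extracts below account for the shortfall.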
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_878
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_878
|
|
|
|
.L_16_blocks_overflow_878:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_878:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_879
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_879
|
|
.L_small_initial_partial_block_879:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_879:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_879
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_879:
|
|
jmp .L_last_blocks_done_865
|
|
.L_last_num_blocks_is_8_865:
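# Reader's annotation: eight trailing blocks, i.e. two full ZMM counter
# vectors; only the very last 16-byte block can be partial, so %k1 merely
# trims the tail of the second 64-byte load and store.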
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_880
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_880
|
|
|
|
.L_16_blocks_overflow_880:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_880:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_881
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_881
|
|
.L_small_initial_partial_block_881:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_881:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_881
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_881:
|
|
jmp .L_last_blocks_done_865
|
|
.L_last_num_blocks_is_9_865:
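# Reader's annotation: from nine blocks upward a third counter vector is
# needed and %rax is biased by 128 before the mask lookup; here that third
# vector is a single XMM block (%xmm4, input in %xmm20).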
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_882
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_882
|
|
|
|
.L_16_blocks_overflow_882:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_882:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %xmm29,%xmm20,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_883
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_883
|
|
.L_small_initial_partial_block_883:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_883:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_883
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_883:
|
|
jmp .L_last_blocks_done_865
|
|
.L_last_num_blocks_is_10_865:
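# Reader's annotation: as in the 9-block case, with the third vector widened
# to %ymm4/%ymm20 to cover blocks nine and ten.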
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_884
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_884
|
|
|
|
.L_16_blocks_overflow_884:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_884:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %ymm29,%ymm20,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_885
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_885
|
|
.L_small_initial_partial_block_885:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_885:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_885
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_885:
|
|
jmp .L_last_blocks_done_865
|
|
.L_last_num_blocks_is_11_865:
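# Reader's annotation: same layout once more with a full ZMM third vector
# (%zmm4, input in %zmm20); lane 2 of %zmm4 yields the updated counter and
# lane 2 of %zmm20 the block whose GHASH may be deferred.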
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_886
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_886
|
|
|
|
.L_16_blocks_overflow_886:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_886:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_887
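# Fewer than 16 bytes left means the last block is partial and is handled at
# .L_small_initial_partial_block_887; otherwise the tail consists solely of
# complete blocks and is hashed immediately below.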
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
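# Reduce the 256-bit GHASH product (%xmm0 high, %xmm3 low) modulo the GCM
# polynomial: two carry-less multiplies by the constant in POLY2 fold the low
# half away, leaving the 128-bit remainder in %xmm14.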
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_887
|
|
.L_small_initial_partial_block_887:
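# Partial final block: record the residual byte count at (%rdx), stash the
# last output block at 16(%rsi), and hash only the complete blocks
# (hash-key material taken from 192(%rsi) onward).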
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_887:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_887
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_887:
|
|
jmp .L_last_blocks_done_865
|
|
.L_last_num_blocks_is_12_865:
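# 12-block tail: same structure as the 11-block case, with the %k1 mask
# applied to the third 64-byte group and hash-key material starting at
# 160(%rsi) when every block is complete.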
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_888
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_888
|
|
|
|
.L_16_blocks_overflow_888:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_888:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_889
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_889
|
|
.L_small_initial_partial_block_889:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_889:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_889
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_889:
|
|
jmp .L_last_blocks_done_865
|
|
.L_last_num_blocks_is_13_865:
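# 13-block tail: a fourth counter register comes into play (%xmm5), and the
# %k1 mask now covers the group at byte offset 192.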
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_890
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_890
|
|
|
|
.L_16_blocks_overflow_890:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_890:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %xmm29,%xmm21,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_891
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_891
|
|
.L_small_initial_partial_block_891:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_891:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_891
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_891:
|
|
jmp .L_last_blocks_done_865
|
|
.L_last_num_blocks_is_14_865:
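# 14-block tail: the fourth group is handled as a 256-bit quantity (%ymm5,
# %ymm21) under the same mask-driven scheme.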
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_892
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_892
|
|
|
|
.L_16_blocks_overflow_892:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_892:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %ymm29,%ymm21,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_893
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_893
|
|
.L_small_initial_partial_block_893:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_893:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_893
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_893:
|
|
jmp .L_last_blocks_done_865
|
|
.L_last_num_blocks_is_15_865:
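# 15-block tail: the fourth group uses full 512-bit registers, with the byte
# mask limiting loads and stores to the bytes actually present.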
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_894
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_894
|
|
|
|
.L_16_blocks_overflow_894:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_894:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_895
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_895
|
|
.L_small_initial_partial_block_895:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_895:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_895
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_895:
|
|
jmp .L_last_blocks_done_865
|
|
.L_last_num_blocks_is_16_865:
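# 16-block tail: every block of the final batch is present, so control falls
# straight through to the partial-block bookkeeping at
# .L_small_initial_partial_block_897 without a separate full-block branch.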
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_896
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_896
|
|
|
|
.L_16_blocks_overflow_896:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_896:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_897:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_897:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_897:
|
|
jmp .L_last_blocks_done_865
|
|
.L_last_num_blocks_is_0_865:
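# No tail blocks remain: finish the GHASH of the 16 previously buffered
# blocks (768..960(%rsp) against 0..192(%rsp,%rbx)) and reduce the result to
# a single 128-bit value in %xmm14.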
|
|
vmovdqa64 768(%rsp),%zmm13
|
|
vpxorq %zmm14,%zmm13,%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 832(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpxorq %zmm10,%zmm4,%zmm26
|
|
vpxorq %zmm6,%zmm0,%zmm24
|
|
vpxorq %zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 896(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 960(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_865:
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_821
|
|
.L_encrypt_16_blocks_821:
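# Full-batch path: encrypt 16 more counter blocks while hashing the previous
# batch, advance the low-byte counter tracker in %r15b by 16, store the new
# byte-reflected blocks at 1280(%rsp) onward for the next pass, then dispatch
# on however many blocks remain.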
|
|
cmpb $240,%r15b
|
|
jae .L_16_blocks_overflow_898
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_898
|
|
.L_16_blocks_overflow_898:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_898:
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp),%zmm1
|
|
|
|
|
|
|
|
|
|
vshufi64x2 $255,%zmm5,%zmm5,%zmm2
|
|
addb $16,%r15b
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm6
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
|
|
|
|
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21
|
|
|
|
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vpxorq %zmm12,%zmm6,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
|
|
|
|
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1)
|
|
vpshufb %zmm29,%zmm17,%zmm0
|
|
vpshufb %zmm29,%zmm19,%zmm3
|
|
vpshufb %zmm29,%zmm20,%zmm4
|
|
vpshufb %zmm29,%zmm21,%zmm5
|
|
vmovdqa64 %zmm0,1280(%rsp)
|
|
vmovdqa64 %zmm3,1344(%rsp)
|
|
vmovdqa64 %zmm4,1408(%rsp)
|
|
vmovdqa64 %zmm5,1472(%rsp)
|
|
vmovdqa64 1024(%rsp),%zmm13
|
|
vmovdqu64 256(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1088(%rsp),%zmm13
|
|
vmovdqu64 320(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1152(%rsp),%zmm13
|
|
vmovdqu64 384(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1216(%rsp),%zmm13
|
|
vmovdqu64 448(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
subq $256,%r8
|
|
addq $256,%r11
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
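# %r10d = ceil(remaining bytes / 16): dispatch to the matching tail handler
# (0..16 blocks) through the compare/branch ladder below.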
|
|
je .L_last_num_blocks_is_0_899
|
|
|
|
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_899
|
|
jb .L_last_num_blocks_is_7_1_899
|
|
|
|
|
|
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_899
|
|
jb .L_last_num_blocks_is_11_9_899
|
|
|
|
|
|
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_899
|
|
ja .L_last_num_blocks_is_16_899
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_899
|
|
jmp .L_last_num_blocks_is_13_899
|
|
|
|
.L_last_num_blocks_is_11_9_899:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_899
|
|
ja .L_last_num_blocks_is_11_899
|
|
jmp .L_last_num_blocks_is_9_899
|
|
|
|
.L_last_num_blocks_is_7_1_899:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_899
|
|
jb .L_last_num_blocks_is_3_1_899
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_899
|
|
je .L_last_num_blocks_is_6_899
|
|
jmp .L_last_num_blocks_is_5_899
|
|
|
|
.L_last_num_blocks_is_3_1_899:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_899
|
|
je .L_last_num_blocks_is_2_899
|
|
.L_last_num_blocks_is_1_899:
|
|
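# One block (or less) remains: build a byte mask in k1 from byte64_len_to_mask_table,
# indexed by the remaining length in %r8.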
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
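# Counter overflow check: if %r15d is at or above the threshold, the counter's low
# byte would wrap, so increment via the byte-swapped path (ddq_add_1234); otherwise
# add the increment vector (xmm28) directly.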
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_900
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_900
|
|
|
|
.L_16_blocks_overflow_900:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_900:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
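# Masked load of the final input bytes; k1 zeroes the lanes beyond the message end.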
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
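# Keep the final result block in xmm11 (stored into the context on the partial-block
# path), write the masked output, then apply the byte mask to the input data and
# byte-reflect it (zmm29) as the GHASH input, with its low lane kept in xmm7.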
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %xmm29,%xmm17,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
|
|
|
|
|
|
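# A full 16-byte final block is folded into GHASH and reduced now; fewer than 16
# bytes fall through to the partial-block path.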
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_901
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
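# Full-block GHASH: xor the running hash into the block, multiply by the hash key at
# 336(%rsi), then reduce the product modulo the GCM polynomial (POLY2) into xmm14.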
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_901
|
|
.L_small_initial_partial_block_901:
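# Partial final block: record the remaining byte count through %rdx and keep the
# result block (xmm11) at 16(%rsi); only the deferred data xor happens below.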
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_901
|
|
.L_small_initial_compute_done_901:
|
|
.L_after_reduction_901:
|
|
jmp .L_last_blocks_done_899
|
|
.L_last_num_blocks_is_2_899:
|
|
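# Two blocks remain: same structure as the one-block case, but the counters and AES
# rounds run on ymm lanes and the k1 mask is built from the full remaining length.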
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_902
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_902
|
|
|
|
.L_16_blocks_overflow_902:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_902:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %ymm29,%ymm17,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_903
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_903
|
|
.L_small_initial_partial_block_903:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_903:
|
|
|
|
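# Fold the byte-reflected partial-block data (xmm7) into the hash only when some
# bytes actually remain (%r8 != 0).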
orq %r8,%r8
|
|
je .L_after_reduction_903
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_903:
|
|
jmp .L_last_blocks_done_899
|
|
.L_last_num_blocks_is_3_899:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_904
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_904
|
|
|
|
.L_16_blocks_overflow_904:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_904:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_905
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_905
|
|
.L_small_initial_partial_block_905:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_905:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_905
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_905:
|
|
jmp .L_last_blocks_done_899
|
|
.L_last_num_blocks_is_4_899:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_906
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_906
|
|
|
|
.L_16_blocks_overflow_906:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_906:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_907
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_907
|
|
.L_small_initial_partial_block_907:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_907:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_907
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_907:
|
|
jmp .L_last_blocks_done_899
|
|
.L_last_num_blocks_is_5_899:
|
|
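# Five to eight remaining blocks use two vector lanes: the first 64 bytes are handled
# unmasked in zmm0, and the mask for the second lane is looked up with (length - 64).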
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_908
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_908
|
|
|
|
.L_16_blocks_overflow_908:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_908:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %xmm29,%xmm19,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_909
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_909
|
|
.L_small_initial_partial_block_909:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_909:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_909
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_909:
|
|
jmp .L_last_blocks_done_899
|
|
.L_last_num_blocks_is_6_899:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_910
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_910
|
|
|
|
.L_16_blocks_overflow_910:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_910:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %ymm29,%ymm19,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_911
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_911
|
|
.L_small_initial_partial_block_911:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_911:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_911
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_911:
|
|
jmp .L_last_blocks_done_899
|
|
.L_last_num_blocks_is_7_899:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_912
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_912
|
|
|
|
.L_16_blocks_overflow_912:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_912:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_913
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_913
|
|
.L_small_initial_partial_block_913:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_913:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_913
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_913:
|
|
jmp .L_last_blocks_done_899
|
|
.L_last_num_blocks_is_8_899:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_914
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_914
|
|
|
|
.L_16_blocks_overflow_914:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_914:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_915
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_915
|
|
.L_small_initial_partial_block_915:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_915:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_915
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_915:
|
|
jmp .L_last_blocks_done_899
|
|
.L_last_num_blocks_is_9_899:
|
|
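# Nine or more remaining blocks use three lanes: the first 128 bytes are handled
# unmasked and the third lane's mask is looked up with (length - 128).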
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_916
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_916
|
|
|
|
.L_16_blocks_overflow_916:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_916:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %xmm29,%xmm20,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_917
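# Fall-through: all 9 blocks are complete.  Clear the residual-byte count at
# (%rdx) and fold the 9 just-processed blocks into GHASH with the hash-key
# powers at 208..336(%rsi) (H^9 down to H^1).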
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_917
|
|
.L_small_initial_partial_block_917:
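# The 9th block is partial: record the remaining byte count at (%rdx), stash
# the last output block (%xmm11) at 16(%rsi), and hash only the 8 complete
# blocks with H^8..H^1 (power table starting at 224(%rsi)), leaving the
# partial block out of GHASH for now.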
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_917:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_917
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_917:
|
|
jmp .L_last_blocks_done_899
|
|
.L_last_num_blocks_is_10_899:
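# As the 9-block path above, but with 10 residual blocks: the third counter
# group is %ymm4 (2 blocks) and the masked load/store at offset 128 uses
# %ymm20/%ymm4.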
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_918
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_918
|
|
|
|
.L_16_blocks_overflow_918:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_918:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %ymm29,%ymm20,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_919
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_919
|
|
.L_small_initial_partial_block_919:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_919:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_919
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_919:
|
|
jmp .L_last_blocks_done_899
|
|
.L_last_num_blocks_is_11_899:
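# 11 residual blocks: the third counter group is a full %zmm4, of which three
# lanes are consumed (masked %zmm20/%zmm4 at offset 128).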
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_920
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_920
|
|
|
|
.L_16_blocks_overflow_920:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_920:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_921
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_921
|
|
.L_small_initial_partial_block_921:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_921:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_921
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_921:
|
|
jmp .L_last_blocks_done_899
|
|
.L_last_num_blocks_is_12_899:
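# 12 residual blocks: three full counter groups (%zmm0/%zmm3/%zmm4); only the
# last 64-byte group at offset 128 is masked by %k1.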
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_922
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_922
|
|
|
|
.L_16_blocks_overflow_922:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_922:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_923
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_923
|
|
.L_small_initial_partial_block_923:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_923:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_923
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_923:
|
|
jmp .L_last_blocks_done_899
|
|
.L_last_num_blocks_is_13_899:
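# 13 residual blocks: a fourth counter group starts in %xmm5 and the masked
# 64-byte group moves to offset 192 (mask index is len-192).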
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_924
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_924
|
|
|
|
.L_16_blocks_overflow_924:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_924:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %xmm29,%xmm21,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_925
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_925
|
|
.L_small_initial_partial_block_925:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_925:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_925
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_925:
|
|
jmp .L_last_blocks_done_899
|
|
.L_last_num_blocks_is_14_899:
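# 14 residual blocks: as the 13-block path, with the fourth counter group in
# %ymm5 (2 blocks).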
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_926
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_926
|
|
|
|
.L_16_blocks_overflow_926:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_926:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %ymm29,%ymm21,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_927
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_927
|
|
.L_small_initial_partial_block_927:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_927:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_927
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_927:
|
|
jmp .L_last_blocks_done_899
|
|
.L_last_num_blocks_is_15_899:
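# 15 residual blocks: as the 13-block path, with the fourth counter group in
# %zmm5, of which three lanes are consumed.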
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_928
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_928
|
|
|
|
.L_16_blocks_overflow_928:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_928:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_929
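# Fall-through: the last of the 15 blocks is complete.  Clear the partial-block
# length (*pblocklen, pointed to by %rdx) and GHASH all 15 blocks now, pairing
# them with H^15..H^1 from the table at %rsi.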
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_929
|
|
.L_small_initial_partial_block_929:
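# The 15th block is only partially filled: record its byte count in *pblocklen
# (%rdx), save the last output block at 16(%rsi), and GHASH just the 14 complete
# blocks (H^14..H^1).  The partial block is XOR-ed into the accumulator right
# before .L_after_reduction_929 so it can be multiplied by H once it is completed.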
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_929:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_929
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_929:
|
|
jmp .L_last_blocks_done_899
|
|
.L_last_num_blocks_is_16_899:
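# Exactly 16 blocks remain.  byte64_len_to_mask_table yields a byte mask %k1 for
# the tail of the last 64-byte chunk (table index = remaining length - 192), so
# the final load/store only touches valid bytes.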
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_930
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_930
|
|
|
|
.L_16_blocks_overflow_930:
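# The low byte of the big-endian counter would wrap while generating these 16
# blocks (last byte >= 240), so the counter is byte-reflected, incremented with
# 32-bit adds (+1..+4, then +4 for each further register), and reflected back.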
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_930:
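# AES rounds for the 16 counter blocks (round keys broadcast from 0..224(%rdi);
# this unrolled instance is the 14-round AES-256 variant) interleaved with the
# GHASH multiply-accumulate of the 16 blocks saved on the stack.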
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vmovdqa64 1280(%rsp),%zmm8
|
|
vmovdqu64 512(%rsp),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 576(%rsp),%zmm18
|
|
vmovdqa64 1344(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 640(%rsp),%zmm1
|
|
vmovdqa64 1408(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 704(%rsp),%zmm18
|
|
vmovdqa64 1472(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpternlogq $0x96,%zmm12,%zmm24,%zmm14
|
|
vpternlogq $0x96,%zmm13,%zmm25,%zmm7
|
|
vpternlogq $0x96,%zmm15,%zmm26,%zmm10
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vpsrldq $8,%zmm10,%zmm15
|
|
vpslldq $8,%zmm10,%zmm10
|
|
|
|
vmovdqa64 POLY2(%rip),%xmm16
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vpxorq %zmm15,%zmm14,%zmm14
|
|
vpxorq %zmm10,%zmm7,%zmm7
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vextracti64x4 $1,%zmm14,%ymm12
|
|
vpxorq %ymm12,%ymm14,%ymm14
|
|
vextracti32x4 $1,%ymm14,%xmm12
|
|
vpxorq %xmm12,%xmm14,%xmm14
|
|
vextracti64x4 $1,%zmm7,%ymm13
|
|
vpxorq %ymm13,%ymm7,%ymm7
|
|
vextracti32x4 $1,%ymm7,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm7
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
|
|
vpslldq $8,%xmm13,%xmm13
|
|
vpxorq %xmm13,%xmm7,%xmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
|
|
vpsrldq $4,%xmm12,%xmm12
|
|
vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
|
|
vpslldq $4,%xmm15,%xmm15
|
|
|
|
vpternlogq $0x96,%xmm12,%xmm15,%xmm14
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_931:
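# 16-block tail: the length of the 16th block (1..16 bytes) is always recorded
# in *pblocklen and the last output block is saved at 16(%rsi).  GHASH covers the
# 15 complete blocks (H^15..H^1); the 16th is XOR-ed into the accumulator below
# and multiplied by H once the block is completed.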
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vpxorq %zmm14,%zmm17,%zmm17
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm31,%zmm5,%zmm5
|
|
vpxorq %zmm8,%zmm0,%zmm0
|
|
vpxorq %zmm22,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_931:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_931:
|
|
jmp .L_last_blocks_done_899
|
|
.L_last_num_blocks_is_0_899:
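# No data blocks left in this pass: only finish GHASH.  The 16 blocks saved at
# 1280..1472(%rsp) are multiplied by the cached powers of H at 512..704(%rsp),
# accumulated into zmm24/zmm25/zmm26, then folded and reduced into the running
# hash in xmm14.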
|
|
vmovdqa64 1280(%rsp),%zmm13
|
|
vmovdqu64 512(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1344(%rsp),%zmm13
|
|
vmovdqu64 576(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 1408(%rsp),%zmm13
|
|
vmovdqu64 640(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 1472(%rsp),%zmm13
|
|
vmovdqu64 704(%rsp),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_899:
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_821
|
|
|
|
.L_message_below_32_blocks_821:
|
|
|
|
|
|
subq $256,%r8
|
|
addq $256,%r11
|
|
movl %r8d,%r10d
|
|
testq %r14,%r14
|
|
jnz .L_skip_hkeys_precomputation_932
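# First time through this tail (%r14 serves as a done-flag): extend the on-stack
# table of hash key powers.  One cached power is broadcast from 640(%rsp) and the
# power groups at 576(%rsp) and 512(%rsp) are repeatedly GF(2^128)-multiplied by
# it (carry-less multiply plus POLY2 reduction) to produce four further 4-power
# groups, stored at 448, 384, 320 and 256(%rsp).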
|
|
vmovdqu64 640(%rsp),%zmm3
|
|
|
|
|
|
vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
|
|
|
|
vmovdqu64 576(%rsp),%zmm4
|
|
vmovdqu64 512(%rsp),%zmm5
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,448(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,384(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
|
|
vpsrldq $8,%zmm4,%zmm10
|
|
vpslldq $8,%zmm4,%zmm4
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm4,%zmm4
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
|
|
vpslldq $4,%zmm4,%zmm4
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm4
|
|
|
|
vmovdqu64 %zmm4,320(%rsp)
|
|
|
|
vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
|
|
vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
|
|
vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
|
|
vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm5,%zmm5
|
|
|
|
vpsrldq $8,%zmm5,%zmm10
|
|
vpslldq $8,%zmm5,%zmm5
|
|
vpxorq %zmm10,%zmm6,%zmm6
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vmovdqu64 POLY2(%rip),%zmm10
|
|
|
|
vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
|
|
vpslldq $8,%zmm7,%zmm7
|
|
vpxorq %zmm7,%zmm5,%zmm5
|
|
|
|
|
|
|
|
vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
|
|
vpsrldq $4,%zmm7,%zmm7
|
|
vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
|
|
vpslldq $4,%zmm5,%zmm5
|
|
|
|
vpternlogq $0x96,%zmm7,%zmm6,%zmm5
|
|
|
|
vmovdqu64 %zmm5,256(%rsp)
|
|
.L_skip_hkeys_precomputation_932:
|
|
movq $1,%r14
|
|
andl $~15,%r10d
|
|
movl $512,%ebx
|
|
subl %r10d,%ebx
|
|
movl %r8d,%r10d
|
|
addl $15,%r10d
|
|
shrl $4,%r10d
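# %r10d = (remaining bytes + 15) >> 4 = number of trailing blocks (0..16); the
# compare/branch tree below dispatches to the matching .L_last_num_blocks_is_N_933
# handler.  %rbx, set above to 512 minus the rounded-down byte count, later
# indexes the on-stack H-power table (0(%rsp,%rbx,1)) so the powers used line up
# with the number of blocks that remain.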
|
|
je .L_last_num_blocks_is_0_933
|
|
|
|
cmpl $8,%r10d
|
|
je .L_last_num_blocks_is_8_933
|
|
jb .L_last_num_blocks_is_7_1_933
|
|
|
|
|
|
cmpl $12,%r10d
|
|
je .L_last_num_blocks_is_12_933
|
|
jb .L_last_num_blocks_is_11_9_933
|
|
|
|
|
|
cmpl $15,%r10d
|
|
je .L_last_num_blocks_is_15_933
|
|
ja .L_last_num_blocks_is_16_933
|
|
cmpl $14,%r10d
|
|
je .L_last_num_blocks_is_14_933
|
|
jmp .L_last_num_blocks_is_13_933
|
|
|
|
.L_last_num_blocks_is_11_9_933:
|
|
|
|
cmpl $10,%r10d
|
|
je .L_last_num_blocks_is_10_933
|
|
ja .L_last_num_blocks_is_11_933
|
|
jmp .L_last_num_blocks_is_9_933
|
|
|
|
.L_last_num_blocks_is_7_1_933:
|
|
cmpl $4,%r10d
|
|
je .L_last_num_blocks_is_4_933
|
|
jb .L_last_num_blocks_is_3_1_933
|
|
|
|
cmpl $6,%r10d
|
|
ja .L_last_num_blocks_is_7_933
|
|
je .L_last_num_blocks_is_6_933
|
|
jmp .L_last_num_blocks_is_5_933
|
|
|
|
.L_last_num_blocks_is_3_1_933:
|
|
|
|
cmpl $2,%r10d
|
|
ja .L_last_num_blocks_is_3_933
|
|
je .L_last_num_blocks_is_2_933
|
|
.L_last_num_blocks_is_1_933:
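# One trailing block: a single counter block is handled in xmm registers, and %k1
# (from byte64_len_to_mask_table, indexed by the remaining byte count) masks the
# input load and output store so only valid tail bytes are touched.  The zmm-wide
# vpclmulqdq work still GHASHes the 16 blocks saved on the stack, accumulating
# into zmm24/zmm25/zmm26.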
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $255,%r15d
|
|
jae .L_16_blocks_overflow_934
|
|
vpaddd %xmm28,%xmm2,%xmm0
|
|
jmp .L_16_blocks_ok_934
|
|
|
|
.L_16_blocks_overflow_934:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
.L_16_blocks_ok_934:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %xmm30,%xmm0,%xmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %xmm31,%xmm0,%xmm0
|
|
vaesenclast %xmm30,%xmm0,%xmm0
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %xmm29,%xmm17,%xmm17
|
|
vextracti32x4 $0,%zmm17,%xmm7
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_935
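# The single remaining block is complete: clear *pblocklen, multiply the block by
# H^1 (336(%rsi)) and merge it with the sums carried in zmm24/zmm25/zmm26 before
# the final fold and POLY2 reduction into xmm14.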
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_935
|
|
.L_small_initial_partial_block_935:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm0
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
|
|
vpslldq $8,%xmm3,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm3
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
|
|
vpsrldq $4,%xmm4,%xmm4
|
|
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm4,%xmm14
vpxorq %xmm7,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_935
|
|
.L_small_initial_compute_done_935:
|
|
.L_after_reduction_935:
|
|
jmp .L_last_blocks_done_933
|
|
.L_last_num_blocks_is_2_933:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $254,%r15d
|
|
jae .L_16_blocks_overflow_936
|
|
vpaddd %ymm28,%ymm2,%ymm0
|
|
jmp .L_16_blocks_ok_936
|
|
|
|
.L_16_blocks_overflow_936:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
.L_16_blocks_ok_936:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %ymm30,%ymm0,%ymm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %ymm31,%ymm0,%ymm0
|
|
vaesenclast %ymm30,%ymm0,%ymm0
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %ymm29,%ymm17,%ymm17
|
|
vextracti32x4 $1,%zmm17,%xmm7
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_937
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_937
|
|
.L_small_initial_partial_block_937:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_937:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_937
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_937:
|
|
jmp .L_last_blocks_done_933
|
|
.L_last_num_blocks_is_3_933:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $253,%r15d
|
|
jae .L_16_blocks_overflow_938
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_938
|
|
|
|
.L_16_blocks_overflow_938:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_938:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $2,%zmm17,%xmm7
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_939
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_939
|
|
.L_small_initial_partial_block_939:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_939:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_939
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_939:
|
|
jmp .L_last_blocks_done_933
|
|
.L_last_num_blocks_is_4_933:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $252,%r15d
|
|
jae .L_16_blocks_overflow_940
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
jmp .L_16_blocks_ok_940
|
|
|
|
.L_16_blocks_overflow_940:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
.L_16_blocks_ok_940:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm17,%zmm17{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vextracti32x4 $3,%zmm17,%xmm7
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_941
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_941
|
|
.L_small_initial_partial_block_941:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpxorq %zmm26,%zmm4,%zmm4
|
|
vpxorq %zmm24,%zmm0,%zmm0
|
|
vpxorq %zmm25,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_941:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_941
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_941:
|
|
jmp .L_last_blocks_done_933
|
|
.L_last_num_blocks_is_5_933:
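# This and the following 6- and 7-block handlers share one pattern: zmm0 carries
# four counter blocks and a second register (xmm3/ymm3/zmm3) carries the remaining
# 1-3 blocks; %k1 (table index = remaining length - 64) masks the partial load and
# store of the second 64-byte chunk.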
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $251,%r15d
|
|
jae .L_16_blocks_overflow_942
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %xmm27,%xmm0,%xmm3
|
|
jmp .L_16_blocks_ok_942
|
|
|
|
.L_16_blocks_overflow_942:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
.L_16_blocks_ok_942:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %xmm30,%xmm3,%xmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %xmm31,%xmm3,%xmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %xmm30,%xmm3,%xmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %xmm29,%xmm19,%xmm19
|
|
vextracti32x4 $0,%zmm19,%xmm7
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_943
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_943
|
|
.L_small_initial_partial_block_943:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_943:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_943
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_943:
|
|
jmp .L_last_blocks_done_933
|
|
.L_last_num_blocks_is_6_933:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $250,%r15d
|
|
jae .L_16_blocks_overflow_944
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %ymm27,%ymm0,%ymm3
|
|
jmp .L_16_blocks_ok_944
|
|
|
|
.L_16_blocks_overflow_944:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
.L_16_blocks_ok_944:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %ymm30,%ymm3,%ymm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %ymm31,%ymm3,%ymm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %ymm30,%ymm3,%ymm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %ymm29,%ymm19,%ymm19
|
|
vextracti32x4 $1,%zmm19,%xmm7
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_945
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_945
|
|
.L_small_initial_partial_block_945:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_945:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_945
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_945:
|
|
jmp .L_last_blocks_done_933
|
|
.L_last_num_blocks_is_7_933:
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $249,%r15d
|
|
jae .L_16_blocks_overflow_946
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_946
|
|
|
|
.L_16_blocks_overflow_946:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_946:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $2,%zmm19,%xmm7
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_947
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_947
|
|
.L_small_initial_partial_block_947:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_947:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_947
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_947:
|
|
jmp .L_last_blocks_done_933
|
|
.L_last_num_blocks_is_8_933:
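# 8 remaining blocks (112 < %r8 <= 128): both 64-byte lanes are used and
# the mask from byte64_len_to_mask_table (%r8-64) covers the tail of the
# second lane; the 8th block is extracted with vextracti32x4 $3.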
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $64,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $248,%r15d
|
|
jae .L_16_blocks_overflow_948
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
jmp .L_16_blocks_ok_948
|
|
|
|
.L_16_blocks_overflow_948:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
.L_16_blocks_ok_948:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm19,%zmm19{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vextracti32x4 $3,%zmm19,%xmm7
|
|
subq $16 * (8 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_949
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_949
|
|
.L_small_initial_partial_block_949:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_949:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_949
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_949:
|
|
jmp .L_last_blocks_done_933
|
|
.L_last_num_blocks_is_9_933:
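# 9 remaining blocks (128 < %r8 <= 144): a third counter lane appears as
# a single block in %xmm4, and the partial-lane mask is now based on
# %r8-128 (third 64-byte chunk).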
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $247,%r15d
|
|
jae .L_16_blocks_overflow_950
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %xmm27,%xmm3,%xmm4
|
|
jmp .L_16_blocks_ok_950
|
|
|
|
.L_16_blocks_overflow_950:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
.L_16_blocks_ok_950:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %xmm30,%xmm4,%xmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %xmm31,%xmm4,%xmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %xmm30,%xmm4,%xmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %xmm20,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %xmm29,%xmm20,%xmm20
|
|
vextracti32x4 $0,%zmm20,%xmm7
|
|
subq $16 * (9 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_951
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_951
|
|
.L_small_initial_partial_block_951:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_951:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_951
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_951:
|
|
jmp .L_last_blocks_done_933
|
|
.L_last_num_blocks_is_10_933:
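# 10 remaining blocks: as above, but the third lane holds two blocks
# (%ymm4 / %ymm20) and the data mask still derives from %r8-128.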
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $246,%r15d
|
|
jae .L_16_blocks_overflow_952
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %ymm27,%ymm3,%ymm4
|
|
jmp .L_16_blocks_ok_952
|
|
|
|
.L_16_blocks_overflow_952:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
.L_16_blocks_ok_952:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %ymm30,%ymm4,%ymm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %ymm31,%ymm4,%ymm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %ymm30,%ymm4,%ymm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %ymm20,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %ymm29,%ymm20,%ymm20
|
|
vextracti32x4 $1,%zmm20,%xmm7
|
|
subq $16 * (10 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_953
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_953
|
|
.L_small_initial_partial_block_953:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_953:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_953
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_953:
|
|
jmp .L_last_blocks_done_933
|
|
.L_last_num_blocks_is_11_933:
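# 11 remaining blocks: the third lane is a full ZMM register (%zmm4),
# loaded and stored under the %k1 byte mask for its last, possibly
# partial block.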
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $245,%r15d
|
|
jae .L_16_blocks_overflow_954
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_954
|
|
|
|
.L_16_blocks_overflow_954:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_954:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $2,%zmm20,%xmm7
|
|
subq $16 * (11 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_955
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_955
|
|
.L_small_initial_partial_block_955:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_955:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_955
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_955:
|
|
jmp .L_last_blocks_done_933
|
|
.L_last_num_blocks_is_12_933:
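# 12 remaining blocks (176 < %r8 <= 192): three full counter lanes; the
# %k1 mask (from %r8-128) covers the third lane, whose last block may
# still be partial (vextracti32x4 $3 of %zmm4 picks block 12).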
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $128,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $244,%r15d
|
|
jae .L_16_blocks_overflow_956
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
jmp .L_16_blocks_ok_956
|
|
|
|
.L_16_blocks_overflow_956:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
.L_16_blocks_ok_956:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm20,%zmm20{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vextracti32x4 $3,%zmm20,%xmm7
|
|
subq $16 * (12 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_957
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_957
|
|
.L_small_initial_partial_block_957:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vpxorq %zmm8,%zmm0,%zmm8
|
|
vpxorq %zmm22,%zmm3,%zmm22
|
|
vpxorq %zmm30,%zmm4,%zmm30
|
|
vpxorq %zmm31,%zmm5,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_957:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_957
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_957:
|
|
jmp .L_last_blocks_done_933
|
|
.L_last_num_blocks_is_13_933:
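# 13 remaining blocks: a fourth counter lane starts as a single block in
# %xmm5 and the mask base switches to %r8-192 (fourth 64-byte chunk).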
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $243,%r15d
|
|
jae .L_16_blocks_overflow_958
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %xmm27,%xmm4,%xmm5
|
|
jmp .L_16_blocks_ok_958
|
|
|
|
.L_16_blocks_overflow_958:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
.L_16_blocks_ok_958:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %xmm30,%xmm5,%xmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %xmm31,%xmm5,%xmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %xmm30,%xmm5,%xmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %xmm21,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %xmm29,%xmm21,%xmm21
|
|
vextracti32x4 $0,%zmm21,%xmm7
|
|
subq $16 * (13 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_959
|
|
|
|
|
|
|
|
|
|
|
|
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_959
|
|
.L_small_initial_partial_block_959:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 160(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 288(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
|
|
vpxorq %zmm26,%zmm30,%zmm30
|
|
vpxorq %zmm24,%zmm8,%zmm8
|
|
vpxorq %zmm25,%zmm22,%zmm22
|
|
|
|
vpxorq %zmm31,%zmm30,%zmm30
|
|
vpsrldq $8,%zmm30,%zmm4
|
|
vpslldq $8,%zmm30,%zmm5
|
|
vpxorq %zmm4,%zmm8,%zmm0
|
|
vpxorq %zmm5,%zmm22,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_959:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_959
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_959:
|
|
jmp .L_last_blocks_done_933
|
|
.L_last_num_blocks_is_14_933:
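# 14 remaining blocks: the fourth lane grows to two blocks (%ymm5 /
# %ymm21); otherwise identical to the 13-block case.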
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $242,%r15d
|
|
jae .L_16_blocks_overflow_960
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %ymm27,%ymm4,%ymm5
|
|
jmp .L_16_blocks_ok_960
|
|
|
|
.L_16_blocks_overflow_960:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
.L_16_blocks_ok_960:
|
|
|
|
|
|
|
|
|
|
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %ymm30,%ymm5,%ymm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %ymm31,%ymm5,%ymm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %ymm30,%ymm5,%ymm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %ymm21,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %ymm29,%ymm21,%ymm21
|
|
vextracti32x4 $1,%zmm21,%xmm7
|
|
subq $16 * (14 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_961
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
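# 256->128-bit GHASH reduction: the high/low halves were folded above; the
# two VPCLMULQDQ operations against POLY2 reduce modulo the GCM polynomial,
# leaving the updated hash in xmm14.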
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_961
|
|
.L_small_initial_partial_block_961:
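# The last block of this tail is partial: record the leftover byte count at
# (%rdx) and the trailing block at 16(%rsi) (context state for a later call,
# presumably), and GHASH only the completed blocks, so the key-power table
# is read one 16-byte slot higher than in the full-block path above.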
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 144(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 272(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 336(%rsi),%xmm1
|
|
vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
|
|
vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
|
|
vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
|
|
vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_961:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_961
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_961:
|
|
jmp .L_last_blocks_done_933
|
|
.L_last_num_blocks_is_15_933:
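# 15-block remainder: same structure as above, but the fourth counter vector
# is handled as a full ZMM and only its first three lanes are consumed (the
# length mask %k1 trims the last 64-byte load/store).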
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $241,%r15d
|
|
jae .L_16_blocks_overflow_962
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_962
|
|
|
|
.L_16_blocks_overflow_962:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_962:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $2,%zmm21,%xmm7
|
|
subq $16 * (15 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_963
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_963
|
|
.L_small_initial_partial_block_963:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 128(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 256(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 320(%rsi),%ymm1
|
|
vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
|
|
vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
|
|
vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
|
|
vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_963:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_963
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_963:
|
|
jmp .L_last_blocks_done_933
|
|
.L_last_num_blocks_is_16_933:
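# 16-block remainder: all four ZMM counter vectors are full; %k1 still
# trims the final 64-byte load/store when the last block is partial.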
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%rax
|
|
subq $192,%rax
|
|
kmovq (%r10,%rax,8),%k1
|
|
cmpl $240,%r15d
|
|
jae .L_16_blocks_overflow_964
|
|
vpaddd %zmm28,%zmm2,%zmm0
|
|
vpaddd %zmm27,%zmm0,%zmm3
|
|
vpaddd %zmm27,%zmm3,%zmm4
|
|
vpaddd %zmm27,%zmm4,%zmm5
|
|
jmp .L_16_blocks_ok_964
|
|
|
|
.L_16_blocks_overflow_964:
|
|
vpshufb %zmm29,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vmovdqa64 ddq_add_4444(%rip),%zmm5
|
|
vpaddd %zmm5,%zmm0,%zmm3
|
|
vpaddd %zmm5,%zmm3,%zmm4
|
|
vpaddd %zmm5,%zmm4,%zmm5
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
.L_16_blocks_ok_964:
vbroadcastf64x2 0(%rdi),%zmm30
|
|
vpxorq 768(%rsp),%zmm14,%zmm8
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
|
|
|
|
vbroadcastf64x2 16(%rdi),%zmm31
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 832(%rsp),%zmm22
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm30,%zmm3,%zmm3
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpxorq %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm1
|
|
vmovdqa64 896(%rsp),%zmm8
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm18
|
|
vmovdqa64 960(%rsp),%zmm22
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm30
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
|
|
vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
|
|
vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm31
|
|
|
|
|
|
vpternlogq $0x96,%zmm17,%zmm12,%zmm14
|
|
vpternlogq $0x96,%zmm19,%zmm13,%zmm7
|
|
vpternlogq $0x96,%zmm21,%zmm16,%zmm11
|
|
vpternlogq $0x96,%zmm20,%zmm15,%zmm10
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm30
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm17
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm19
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm20
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm31
|
|
|
|
|
|
vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
|
|
vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
|
|
vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
|
|
vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm30
|
|
vpternlogq $0x96,%zmm16,%zmm11,%zmm10
|
|
vpxorq %zmm12,%zmm14,%zmm24
|
|
vpxorq %zmm13,%zmm7,%zmm25
|
|
vpxorq %zmm15,%zmm10,%zmm26
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm31
|
|
vaesenc %zmm30,%zmm0,%zmm0
|
|
vaesenc %zmm30,%zmm3,%zmm3
|
|
vaesenc %zmm30,%zmm4,%zmm4
|
|
vaesenc %zmm30,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm30
|
|
vaesenc %zmm31,%zmm0,%zmm0
|
|
vaesenc %zmm31,%zmm3,%zmm3
|
|
vaesenc %zmm31,%zmm4,%zmm4
|
|
vaesenc %zmm31,%zmm5,%zmm5
|
|
vaesenclast %zmm30,%zmm0,%zmm0
|
|
vaesenclast %zmm30,%zmm3,%zmm3
|
|
vaesenclast %zmm30,%zmm4,%zmm4
|
|
vaesenclast %zmm30,%zmm5,%zmm5
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vpxorq %zmm20,%zmm4,%zmm4
|
|
vpxorq %zmm21,%zmm5,%zmm5
|
|
vextracti32x4 $3,%zmm5,%xmm11
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm21,%zmm21{%k1}{z}
|
|
vpshufb %zmm29,%zmm17,%zmm17
|
|
vpshufb %zmm29,%zmm19,%zmm19
|
|
vpshufb %zmm29,%zmm20,%zmm20
|
|
vpshufb %zmm29,%zmm21,%zmm21
|
|
vextracti32x4 $3,%zmm21,%xmm7
|
|
subq $16 * (16 - 1),%r8
|
|
.L_small_initial_partial_block_965:
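# With a 16-block tail there is no separate full-block branch: the leftover
# byte count and the trailing block are always recorded before the final
# GHASH and reduction.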
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm11,16(%rsi)
|
|
vmovdqu64 112(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
|
|
vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
|
|
vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
|
|
vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
|
|
vmovdqu64 240(%rsi),%zmm1
|
|
vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm0,%zmm17,%zmm8
|
|
vpternlogq $0x96,%zmm3,%zmm19,%zmm22
|
|
vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
|
|
vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
|
|
vpternlogq $0x96,%zmm4,%zmm17,%zmm30
|
|
vpternlogq $0x96,%zmm5,%zmm19,%zmm31
|
|
vmovdqu64 304(%rsi),%ymm1
|
|
vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
|
|
vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
|
|
vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
|
|
vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
|
|
vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
|
|
|
|
vpxorq %zmm30,%zmm4,%zmm4
|
|
vpternlogq $0x96,%zmm31,%zmm26,%zmm5
|
|
vpternlogq $0x96,%zmm8,%zmm24,%zmm0
|
|
vpternlogq $0x96,%zmm22,%zmm25,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm30
|
|
vpslldq $8,%zmm4,%zmm31
|
|
vpxorq %zmm30,%zmm0,%zmm0
|
|
vpxorq %zmm31,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm30
|
|
vpxorq %ymm30,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm30
|
|
vpxorq %xmm30,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm31
|
|
vpxorq %ymm31,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm31
|
|
vpxorq %xmm31,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm1
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_965:
|
|
vpxorq %xmm7,%xmm14,%xmm14
|
|
.L_after_reduction_965:
|
|
jmp .L_last_blocks_done_933
|
|
.L_last_num_blocks_is_0_933:
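# No tail blocks left to encrypt: fold the 16 ciphertext blocks buffered in
# the stack frame (768(%rsp) onward) into the hash with their key powers,
# then perform the final reduction into xmm14.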
|
|
vmovdqa64 768(%rsp),%zmm13
|
|
vpxorq %zmm14,%zmm13,%zmm13
|
|
vmovdqu64 0(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 832(%rsp),%zmm13
|
|
vmovdqu64 64(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
vpxorq %zmm10,%zmm4,%zmm26
|
|
vpxorq %zmm6,%zmm0,%zmm24
|
|
vpxorq %zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
vmovdqa64 896(%rsp),%zmm13
|
|
vmovdqu64 128(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
|
|
vmovdqa64 960(%rsp),%zmm13
|
|
vmovdqu64 192(%rsp,%rbx,1),%zmm12
|
|
vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
|
|
vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
|
|
vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
|
|
vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
|
|
|
|
vpternlogq $0x96,%zmm10,%zmm4,%zmm26
|
|
vpternlogq $0x96,%zmm6,%zmm0,%zmm24
|
|
vpternlogq $0x96,%zmm7,%zmm3,%zmm25
|
|
vpternlogq $0x96,%zmm11,%zmm5,%zmm26
|
|
|
|
vpsrldq $8,%zmm26,%zmm0
|
|
vpslldq $8,%zmm26,%zmm3
|
|
vpxorq %zmm0,%zmm24,%zmm24
|
|
vpxorq %zmm3,%zmm25,%zmm25
|
|
vextracti64x4 $1,%zmm24,%ymm0
|
|
vpxorq %ymm0,%ymm24,%ymm24
|
|
vextracti32x4 $1,%ymm24,%xmm0
|
|
vpxorq %xmm0,%xmm24,%xmm24
|
|
vextracti64x4 $1,%zmm25,%ymm3
|
|
vpxorq %ymm3,%ymm25,%ymm25
|
|
vextracti32x4 $1,%ymm25,%xmm3
|
|
vpxorq %xmm3,%xmm25,%xmm25
|
|
vmovdqa64 POLY2(%rip),%xmm4
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
|
|
vpslldq $8,%xmm0,%xmm0
|
|
vpxorq %xmm0,%xmm25,%xmm0
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
|
|
vpsrldq $4,%xmm3,%xmm3
|
|
vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm24,%xmm3,%xmm14
|
|
|
|
.L_last_blocks_done_933:
|
|
vpshufb %xmm29,%xmm2,%xmm2
|
|
jmp .L_ghash_done_821
.L_message_below_equal_16_blocks_821:
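# The whole message fits in at most 16 blocks: compute ceil(len/16) in %r12
# and dispatch to the matching single-pass routine below.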
movl %r8d,%r12d
addl $15,%r12d
shrl $4,%r12d
cmpq $8,%r12
je .L_small_initial_num_blocks_is_8_966
jl .L_small_initial_num_blocks_is_7_1_966

cmpq $12,%r12
je .L_small_initial_num_blocks_is_12_966
jl .L_small_initial_num_blocks_is_11_9_966

cmpq $16,%r12
je .L_small_initial_num_blocks_is_16_966
cmpq $15,%r12
je .L_small_initial_num_blocks_is_15_966
cmpq $14,%r12
je .L_small_initial_num_blocks_is_14_966
jmp .L_small_initial_num_blocks_is_13_966

.L_small_initial_num_blocks_is_11_9_966:

cmpq $11,%r12
je .L_small_initial_num_blocks_is_11_966
cmpq $10,%r12
je .L_small_initial_num_blocks_is_10_966
jmp .L_small_initial_num_blocks_is_9_966

.L_small_initial_num_blocks_is_7_1_966:
cmpq $4,%r12
je .L_small_initial_num_blocks_is_4_966
jl .L_small_initial_num_blocks_is_3_1_966

cmpq $7,%r12
je .L_small_initial_num_blocks_is_7_966
cmpq $6,%r12
je .L_small_initial_num_blocks_is_6_966
jmp .L_small_initial_num_blocks_is_5_966

.L_small_initial_num_blocks_is_3_1_966:

cmpq $3,%r12
je .L_small_initial_num_blocks_is_3_966
cmpq $2,%r12
je .L_small_initial_num_blocks_is_2_966
.L_small_initial_num_blocks_is_1_966:
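# 1-block message: a single counter block in xmm0, one full AES pass, and a
# byte-masked (%k1) load/store covering the 1..16 message bytes.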
|
|
vmovdqa64 SHUF_MASK(%rip),%xmm29
|
|
vpaddd ONE(%rip),%xmm2,%xmm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm0,%xmm2
|
|
vpshufb %xmm29,%xmm0,%xmm0
|
|
vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %xmm15,%xmm0,%xmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %xmm15,%xmm0,%xmm0
|
|
vpxorq %xmm6,%xmm0,%xmm0
|
|
vextracti32x4 $0,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %xmm29,%xmm6,%xmm6
|
|
vextracti32x4 $0,%zmm6,%xmm13
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_967
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_967
|
|
.L_small_initial_partial_block_967:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
vpxorq %xmm13,%xmm14,%xmm14
|
|
|
|
jmp .L_after_reduction_967
|
|
.L_small_initial_compute_done_967:
|
|
.L_after_reduction_967:
|
|
jmp .L_small_initial_blocks_encrypted_966
|
|
.L_small_initial_num_blocks_is_2_966:
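# 2-block message: counters and AES rounds run on YMM registers; the
# load/store is still byte-masked through %k1.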
|
|
vmovdqa64 SHUF_MASK(%rip),%ymm29
|
|
vshufi64x2 $0,%ymm2,%ymm2,%ymm0
|
|
vpaddd ddq_add_1234(%rip),%ymm0,%ymm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm0,%xmm2
|
|
vpshufb %ymm29,%ymm0,%ymm0
|
|
vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %ymm15,%ymm0,%ymm0
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %ymm15,%ymm0,%ymm0
|
|
vpxorq %ymm6,%ymm0,%ymm0
|
|
vextracti32x4 $1,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %ymm29,%ymm6,%ymm6
|
|
vextracti32x4 $1,%zmm6,%xmm13
|
|
subq $16 * (2 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_968
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_968
|
|
.L_small_initial_partial_block_968:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_968:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_968
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_968:
|
|
jmp .L_small_initial_blocks_encrypted_966
|
|
.L_small_initial_num_blocks_is_3_966:
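# 3-block message: one ZMM of counter blocks, with %k1 limiting the 64-byte
# load/store to the actual message length.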
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm0,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vextracti32x4 $2,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vextracti32x4 $2,%zmm6,%xmm13
|
|
subq $16 * (3 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_969
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_969
|
|
.L_small_initial_partial_block_969:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_969:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_969
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_969:
|
|
jmp .L_small_initial_blocks_encrypted_966
|
|
.L_small_initial_num_blocks_is_4_966:
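# 4-block message: one full ZMM batch; %k1 trims the 64-byte load/store to
# the exact length.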
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm0,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vextracti32x4 $3,%zmm0,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm0,%zmm0{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vextracti32x4 $3,%zmm6,%xmm13
|
|
subq $16 * (4 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_970
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_970
|
|
.L_small_initial_partial_block_970:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_970:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_970
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_970:
|
|
jmp .L_small_initial_blocks_encrypted_966
|
|
.L_small_initial_num_blocks_is_5_966:
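# 5-block message: four blocks in zmm0 plus one in xmm3; the byte mask is
# built from length-64 and applies only to the second load/store.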
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %xmm29,%xmm3,%xmm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %xmm15,%xmm3,%xmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %xmm15,%xmm3,%xmm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %xmm7,%xmm3,%xmm3
|
|
vextracti32x4 $0,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %xmm29,%xmm7,%xmm7
|
|
vextracti32x4 $0,%zmm7,%xmm13
|
|
subq $16 * (5 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_971
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_971
|
|
.L_small_initial_partial_block_971:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
|
|
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_971:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_971
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_971:
|
|
jmp .L_small_initial_blocks_encrypted_966
|
|
.L_small_initial_num_blocks_is_6_966:
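# 6-block message: zmm0 plus two blocks in ymm3, with the same length-64
# byte mask on the second load/store.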
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %ymm29,%ymm3,%ymm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %ymm15,%ymm3,%ymm3
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %ymm15,%ymm3,%ymm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %ymm7,%ymm3,%ymm3
|
|
vextracti32x4 $1,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %ymm29,%ymm7,%ymm7
|
|
vextracti32x4 $1,%zmm7,%xmm13
|
|
subq $16 * (6 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_972
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_972
|
|
.L_small_initial_partial_block_972:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_972:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_972
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_972:
|
|
jmp .L_small_initial_blocks_encrypted_966
|
|
.L_small_initial_num_blocks_is_7_966:
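# 7-block message: zmm0 plus three blocks in zmm3; the second load/store is
# byte-masked via %k1.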
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vextracti32x4 $2,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vextracti32x4 $2,%zmm7,%xmm13
|
|
subq $16 * (7 - 1),%r8
|
|
|
|
|
|
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_973
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
jmp .L_small_initial_compute_done_973
|
|
.L_small_initial_partial_block_973:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
|
|
|
|
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
|
|
|
|
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
|
|
|
|
|
|
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
|
|
|
|
|
|
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
|
|
|
|
.L_small_initial_compute_done_973:
|
|
|
|
orq %r8,%r8
|
|
je .L_after_reduction_973
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_973:
|
|
jmp .L_small_initial_blocks_encrypted_966
|
|
.L_small_initial_num_blocks_is_8_966:
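# 8-block tail: build 8 counter blocks from zmm2 (ddq_add_1234/5678), byte-swap
# with SHUF_MASK, encrypt them with the 15 round keys at (%rdi) (AES-256
# schedule length), XOR against the input from (%rcx,%r11,1) (last vector
# masked by %k1), store the result at (%r9,%r11,1), then fold the byte-reflected
# input blocks into the GHASH accumulator xmm14 using the key powers at (%rsi).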
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $64,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm3,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vextracti32x4 $3,%zmm3,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm3,%zmm3{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vextracti32x4 $3,%zmm7,%xmm13
|
|
subq $16 * (8 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_974
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
jmp .L_small_initial_compute_done_974
|
|
.L_small_initial_partial_block_974:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
.L_small_initial_compute_done_974:
orq %r8,%r8
|
|
je .L_after_reduction_974
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_974:
|
|
jmp .L_small_initial_blocks_encrypted_966
|
|
.L_small_initial_num_blocks_is_9_966:
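# The 9- to 15-block tails below repeat the same scheme; blocks beyond the
# first eight travel in zmm4/zmm5 (or their ymm/xmm parts for short remainders)
# and the key-power table at (%rsi) is read from a lower offset so that enough
# powers of the hash key are available for every block.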
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %xmm29,%xmm4,%xmm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %xmm15,%xmm4,%xmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %xmm15,%xmm4,%xmm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %xmm10,%xmm4,%xmm4
|
|
vextracti32x4 $0,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %xmm29,%xmm10,%xmm10
|
|
vextracti32x4 $0,%zmm10,%xmm13
|
|
subq $16 * (9 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_975
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
jmp .L_small_initial_compute_done_975
|
|
.L_small_initial_partial_block_975:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
.L_small_initial_compute_done_975:
orq %r8,%r8
|
|
je .L_after_reduction_975
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_975:
|
|
jmp .L_small_initial_blocks_encrypted_966
|
|
.L_small_initial_num_blocks_is_10_966:
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %ymm29,%ymm4,%ymm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %ymm15,%ymm4,%ymm4
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %ymm15,%ymm4,%ymm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %ymm10,%ymm4,%ymm4
|
|
vextracti32x4 $1,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %ymm29,%ymm10,%ymm10
|
|
vextracti32x4 $1,%zmm10,%xmm13
|
|
subq $16 * (10 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_976
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
jmp .L_small_initial_compute_done_976
|
|
.L_small_initial_partial_block_976:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
.L_small_initial_compute_done_976:
orq %r8,%r8
|
|
je .L_after_reduction_976
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_976:
|
|
jmp .L_small_initial_blocks_encrypted_966
|
|
.L_small_initial_num_blocks_is_11_966:
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vextracti32x4 $2,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vextracti32x4 $2,%zmm10,%xmm13
|
|
subq $16 * (11 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_977
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
jmp .L_small_initial_compute_done_977
|
|
.L_small_initial_partial_block_977:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
.L_small_initial_compute_done_977:
orq %r8,%r8
|
|
je .L_after_reduction_977
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_977:
|
|
jmp .L_small_initial_blocks_encrypted_966
|
|
.L_small_initial_num_blocks_is_12_966:
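# 12-block tail: three full ZMM vectors of data; in the full-block path below
# the third vector's GHASH partial products are merged with vpternlogq $0x96
# (a three-way XOR) rather than separate vpxorq steps.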
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $128,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm4,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vextracti32x4 $3,%zmm4,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm4,%zmm4{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vextracti32x4 $3,%zmm10,%xmm13
|
|
subq $16 * (12 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_978
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 160(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
jmp .L_small_initial_compute_done_978
|
|
.L_small_initial_partial_block_978:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vpxorq %zmm15,%zmm0,%zmm15
|
|
vpxorq %zmm16,%zmm3,%zmm16
|
|
vpxorq %zmm17,%zmm4,%zmm17
|
|
vpxorq %zmm19,%zmm5,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
.L_small_initial_compute_done_978:
orq %r8,%r8
|
|
je .L_after_reduction_978
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_978:
|
|
jmp .L_small_initial_blocks_encrypted_966
|
|
.L_small_initial_num_blocks_is_13_966:
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $0,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %xmm29,%xmm5,%xmm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %xmm15,%xmm5,%xmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %xmm15,%xmm5,%xmm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %xmm11,%xmm5,%xmm5
|
|
vextracti32x4 $0,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %xmm29,%xmm11,%xmm11
|
|
vextracti32x4 $0,%zmm11,%xmm13
|
|
subq $16 * (13 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_979
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 144(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
jmp .L_small_initial_compute_done_979
|
|
.L_small_initial_partial_block_979:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 160(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 224(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 288(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
vpxorq %zmm19,%zmm17,%zmm17
|
|
vpsrldq $8,%zmm17,%zmm4
|
|
vpslldq $8,%zmm17,%zmm5
|
|
vpxorq %zmm4,%zmm15,%zmm0
|
|
vpxorq %zmm5,%zmm16,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
.L_small_initial_compute_done_979:
orq %r8,%r8
|
|
je .L_after_reduction_979
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_979:
|
|
jmp .L_small_initial_blocks_encrypted_966
|
|
.L_small_initial_num_blocks_is_14_966:
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $1,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %ymm29,%ymm5,%ymm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %ymm15,%ymm5,%ymm5
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %ymm15,%ymm5,%ymm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %ymm11,%ymm5,%ymm5
|
|
vextracti32x4 $1,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %ymm29,%ymm11,%ymm11
|
|
vextracti32x4 $1,%zmm11,%xmm13
|
|
subq $16 * (14 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_980
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 128(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
jmp .L_small_initial_compute_done_980
|
|
.L_small_initial_partial_block_980:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 144(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 208(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 272(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 336(%rsi),%xmm20
|
|
vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
|
|
vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
|
|
vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
|
|
vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
.L_small_initial_compute_done_980:
orq %r8,%r8
|
|
je .L_after_reduction_980
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_980:
|
|
jmp .L_small_initial_blocks_encrypted_966
|
|
.L_small_initial_num_blocks_is_15_966:
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $2,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
|
|
vbroadcastf64x2 0(%rdi),%zmm15
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm15,%zmm3,%zmm3
|
|
vpxorq %zmm15,%zmm4,%zmm4
|
|
vpxorq %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 16(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 32(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 48(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 64(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 80(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 96(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 112(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 128(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 144(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 160(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 176(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 192(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 208(%rdi),%zmm15
|
|
vaesenc %zmm15,%zmm0,%zmm0
|
|
vaesenc %zmm15,%zmm3,%zmm3
|
|
vaesenc %zmm15,%zmm4,%zmm4
|
|
vaesenc %zmm15,%zmm5,%zmm5
|
|
vbroadcastf64x2 224(%rdi),%zmm15
|
|
vaesenclast %zmm15,%zmm0,%zmm0
|
|
vaesenclast %zmm15,%zmm3,%zmm3
|
|
vaesenclast %zmm15,%zmm4,%zmm4
|
|
vaesenclast %zmm15,%zmm5,%zmm5
|
|
vpxorq %zmm6,%zmm0,%zmm0
|
|
vpxorq %zmm7,%zmm3,%zmm3
|
|
vpxorq %zmm10,%zmm4,%zmm4
|
|
vpxorq %zmm11,%zmm5,%zmm5
|
|
vextracti32x4 $2,%zmm5,%xmm12
|
|
movq %r9,%r10
|
|
vmovdqu8 %zmm0,0(%r10,%r11,1)
|
|
vmovdqu8 %zmm3,64(%r10,%r11,1)
|
|
vmovdqu8 %zmm4,128(%r10,%r11,1)
|
|
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
|
|
vmovdqu8 %zmm5,%zmm5{%k1}{z}
|
|
vpshufb %zmm29,%zmm6,%zmm6
|
|
vpshufb %zmm29,%zmm7,%zmm7
|
|
vpshufb %zmm29,%zmm10,%zmm10
|
|
vpshufb %zmm29,%zmm11,%zmm11
|
|
vextracti32x4 $2,%zmm11,%xmm13
|
|
subq $16 * (15 - 1),%r8
cmpq $16,%r8
|
|
jl .L_small_initial_partial_block_981
subq $16,%r8
|
|
movq $0,(%rdx)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 112(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 176(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 240(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 304(%rsi),%ymm20
|
|
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
|
|
vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
|
|
vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
jmp .L_small_initial_compute_done_981
|
|
.L_small_initial_partial_block_981:
movq %r8,(%rdx)
|
|
vmovdqu64 %xmm12,16(%rsi)
|
|
vpxorq %zmm14,%zmm6,%zmm6
|
|
vmovdqu64 128(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
|
|
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
|
|
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
|
|
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
|
|
vmovdqu64 192(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
|
|
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
|
|
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
|
|
vmovdqu64 256(%rsi),%zmm20
|
|
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
|
|
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
|
|
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
|
|
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
|
|
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
|
|
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
|
|
vmovdqu64 320(%rsi),%ymm20
|
|
vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
|
|
vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
|
|
vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
|
|
vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
vpxorq %zmm17,%zmm4,%zmm4
|
|
vpxorq %zmm19,%zmm5,%zmm5
|
|
vpxorq %zmm15,%zmm0,%zmm0
|
|
vpxorq %zmm16,%zmm3,%zmm3
vpxorq %zmm5,%zmm4,%zmm4
|
|
vpsrldq $8,%zmm4,%zmm17
|
|
vpslldq $8,%zmm4,%zmm19
|
|
vpxorq %zmm17,%zmm0,%zmm0
|
|
vpxorq %zmm19,%zmm3,%zmm3
|
|
vextracti64x4 $1,%zmm0,%ymm17
|
|
vpxorq %ymm17,%ymm0,%ymm0
|
|
vextracti32x4 $1,%ymm0,%xmm17
|
|
vpxorq %xmm17,%xmm0,%xmm0
|
|
vextracti64x4 $1,%zmm3,%ymm19
|
|
vpxorq %ymm19,%ymm3,%ymm3
|
|
vextracti32x4 $1,%ymm3,%xmm19
|
|
vpxorq %xmm19,%xmm3,%xmm3
|
|
vmovdqa64 POLY2(%rip),%xmm20
vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
|
|
vpslldq $8,%xmm4,%xmm4
|
|
vpxorq %xmm4,%xmm3,%xmm4
vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
|
|
vpsrldq $4,%xmm5,%xmm5
|
|
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
|
|
vpslldq $4,%xmm14,%xmm14
|
|
vpternlogq $0x96,%xmm0,%xmm5,%xmm14
.L_small_initial_compute_done_981:
orq %r8,%r8
|
|
je .L_after_reduction_981
|
|
vpxorq %xmm13,%xmm14,%xmm14
|
|
.L_after_reduction_981:
|
|
jmp .L_small_initial_blocks_encrypted_966
|
|
.L_small_initial_num_blocks_is_16_966:
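# 16-block tail: the largest remainder case. There is no cmpq/jl split here;
# control falls straight into .L_small_initial_partial_block_982, which records
# the remaining length at (%rdx) and the last output block at 16(%rsi), and the
# final per-block hash term (xmm13) is XORed into xmm14 unconditionally.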
|
|
vmovdqa64 SHUF_MASK(%rip),%zmm29
|
|
vshufi64x2 $0,%zmm2,%zmm2,%zmm2
|
|
vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
|
|
vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
|
|
vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
|
|
vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
|
|
leaq byte64_len_to_mask_table(%rip),%r10
|
|
movq %r8,%r15
|
|
subq $192,%r15
|
|
kmovq (%r10,%r15,8),%k1
|
|
vextracti32x4 $3,%zmm5,%xmm2
|
|
vpshufb %zmm29,%zmm0,%zmm0
|
|
vpshufb %zmm29,%zmm3,%zmm3
|
|
vpshufb %zmm29,%zmm4,%zmm4
|
|
vpshufb %zmm29,%zmm5,%zmm5
|
|
vmovdqu8 0(%rcx,%r11,1),%zmm6
|
|
vmovdqu8 64(%rcx,%r11,1),%zmm7
|
|
vmovdqu8 128(%rcx,%r11,1),%zmm10
|
|
vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
vbroadcastf64x2 0(%rdi),%zmm15
vpxorq %zmm15,%zmm0,%zmm0
vpxorq %zmm15,%zmm3,%zmm3
vpxorq %zmm15,%zmm4,%zmm4
vpxorq %zmm15,%zmm5,%zmm5
vbroadcastf64x2 16(%rdi),%zmm15
vaesenc %zmm15,%zmm0,%zmm0
vaesenc %zmm15,%zmm3,%zmm3
vaesenc %zmm15,%zmm4,%zmm4
vaesenc %zmm15,%zmm5,%zmm5
vbroadcastf64x2 32(%rdi),%zmm15
vaesenc %zmm15,%zmm0,%zmm0
vaesenc %zmm15,%zmm3,%zmm3
vaesenc %zmm15,%zmm4,%zmm4
vaesenc %zmm15,%zmm5,%zmm5
vbroadcastf64x2 48(%rdi),%zmm15
vaesenc %zmm15,%zmm0,%zmm0
vaesenc %zmm15,%zmm3,%zmm3
vaesenc %zmm15,%zmm4,%zmm4
vaesenc %zmm15,%zmm5,%zmm5
vbroadcastf64x2 64(%rdi),%zmm15
vaesenc %zmm15,%zmm0,%zmm0
vaesenc %zmm15,%zmm3,%zmm3
vaesenc %zmm15,%zmm4,%zmm4
vaesenc %zmm15,%zmm5,%zmm5
vbroadcastf64x2 80(%rdi),%zmm15
vaesenc %zmm15,%zmm0,%zmm0
vaesenc %zmm15,%zmm3,%zmm3
vaesenc %zmm15,%zmm4,%zmm4
vaesenc %zmm15,%zmm5,%zmm5
vbroadcastf64x2 96(%rdi),%zmm15
vaesenc %zmm15,%zmm0,%zmm0
vaesenc %zmm15,%zmm3,%zmm3
vaesenc %zmm15,%zmm4,%zmm4
vaesenc %zmm15,%zmm5,%zmm5
vbroadcastf64x2 112(%rdi),%zmm15
vaesenc %zmm15,%zmm0,%zmm0
vaesenc %zmm15,%zmm3,%zmm3
vaesenc %zmm15,%zmm4,%zmm4
vaesenc %zmm15,%zmm5,%zmm5
vbroadcastf64x2 128(%rdi),%zmm15
vaesenc %zmm15,%zmm0,%zmm0
vaesenc %zmm15,%zmm3,%zmm3
vaesenc %zmm15,%zmm4,%zmm4
vaesenc %zmm15,%zmm5,%zmm5
vbroadcastf64x2 144(%rdi),%zmm15
vaesenc %zmm15,%zmm0,%zmm0
vaesenc %zmm15,%zmm3,%zmm3
vaesenc %zmm15,%zmm4,%zmm4
vaesenc %zmm15,%zmm5,%zmm5
vbroadcastf64x2 160(%rdi),%zmm15
vaesenc %zmm15,%zmm0,%zmm0
vaesenc %zmm15,%zmm3,%zmm3
vaesenc %zmm15,%zmm4,%zmm4
vaesenc %zmm15,%zmm5,%zmm5
vbroadcastf64x2 176(%rdi),%zmm15
vaesenc %zmm15,%zmm0,%zmm0
vaesenc %zmm15,%zmm3,%zmm3
vaesenc %zmm15,%zmm4,%zmm4
vaesenc %zmm15,%zmm5,%zmm5
vbroadcastf64x2 192(%rdi),%zmm15
vaesenc %zmm15,%zmm0,%zmm0
vaesenc %zmm15,%zmm3,%zmm3
vaesenc %zmm15,%zmm4,%zmm4
vaesenc %zmm15,%zmm5,%zmm5
vbroadcastf64x2 208(%rdi),%zmm15
vaesenc %zmm15,%zmm0,%zmm0
vaesenc %zmm15,%zmm3,%zmm3
vaesenc %zmm15,%zmm4,%zmm4
vaesenc %zmm15,%zmm5,%zmm5
vbroadcastf64x2 224(%rdi),%zmm15
vaesenclast %zmm15,%zmm0,%zmm0
vaesenclast %zmm15,%zmm3,%zmm3
vaesenclast %zmm15,%zmm4,%zmm4
vaesenclast %zmm15,%zmm5,%zmm5
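# Note: XOR-ing the keystream with the loaded ciphertext yields the plaintext,
# which is written back with the partial-block mask %k1; the ciphertext copies
# are then byte-reflected with SHUF_MASK so they can be folded into GHASH.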
vpxorq %zmm6,%zmm0,%zmm0
vpxorq %zmm7,%zmm3,%zmm3
vpxorq %zmm10,%zmm4,%zmm4
vpxorq %zmm11,%zmm5,%zmm5
vextracti32x4 $3,%zmm5,%xmm12
movq %r9,%r10
vmovdqu8 %zmm0,0(%r10,%r11,1)
vmovdqu8 %zmm3,64(%r10,%r11,1)
vmovdqu8 %zmm4,128(%r10,%r11,1)
vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
vmovdqu8 %zmm5,%zmm5{%k1}{z}
vpshufb %zmm29,%zmm6,%zmm6
vpshufb %zmm29,%zmm7,%zmm7
vpshufb %zmm29,%zmm10,%zmm10
vpshufb %zmm29,%zmm11,%zmm11
vextracti32x4 $3,%zmm11,%xmm13
subq $16 * (16 - 1),%r8
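# Note: the partial-block path below records the remaining byte count at (%rdx)
# and the last counter block at 16(%rsi), XORs the running hash (xmm14) into
# the first block, and folds the 16 ciphertext blocks into the GHASH
# accumulator using what appear to be precomputed powers of the hash key stored
# at 112/176/240/304 and 336(%rsi).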
.L_small_initial_partial_block_982:

movq %r8,(%rdx)
vmovdqu64 %xmm12,16(%rsi)
vpxorq %zmm14,%zmm6,%zmm6
vmovdqu64 112(%rsi),%zmm20
vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
vmovdqu64 176(%rsi),%zmm20
vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
vmovdqu64 240(%rsi),%zmm20
vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
vpternlogq $0x96,%zmm0,%zmm6,%zmm15
vpternlogq $0x96,%zmm3,%zmm7,%zmm16
vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
vpternlogq $0x96,%zmm4,%zmm6,%zmm17
vpternlogq $0x96,%zmm5,%zmm7,%zmm19
vmovdqu64 304(%rsi),%ymm20
vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3

vpxorq %zmm17,%zmm4,%zmm4
vpxorq %zmm19,%zmm5,%zmm5
vpxorq %zmm15,%zmm0,%zmm0
vpxorq %zmm16,%zmm3,%zmm3

vpxorq %zmm5,%zmm4,%zmm4
vpsrldq $8,%zmm4,%zmm17
vpslldq $8,%zmm4,%zmm19
vpxorq %zmm17,%zmm0,%zmm0
vpxorq %zmm19,%zmm3,%zmm3
vextracti64x4 $1,%zmm0,%ymm17
vpxorq %ymm17,%ymm0,%ymm0
vextracti32x4 $1,%ymm0,%xmm17
vpxorq %xmm17,%xmm0,%xmm0
vextracti64x4 $1,%zmm3,%ymm19
vpxorq %ymm19,%ymm3,%ymm3
vextracti32x4 $1,%ymm3,%xmm19
vpxorq %xmm19,%xmm3,%xmm3
vmovdqa64 POLY2(%rip),%xmm20

vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
vpslldq $8,%xmm4,%xmm4
vpxorq %xmm4,%xmm3,%xmm4

vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
vpsrldq $4,%xmm5,%xmm5
vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
vpslldq $4,%xmm14,%xmm14
vpternlogq $0x96,%xmm0,%xmm5,%xmm14

.L_small_initial_compute_done_982:
vpxorq %xmm13,%xmm14,%xmm14
.L_after_reduction_982:
.L_small_initial_blocks_encrypted_966:
.L_ghash_done_821:
vmovdqu64 %xmm2,0(%rsi)
vmovdqu64 %xmm14,64(%rsi)
.L_enc_dec_done_821:
jmp .Lexit_gcm_decrypt
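# Note: common decrypt exit. When the bulk path apparently spilled expanded
# hash-key material to the stack (lengths above 256 bytes), the 768 stack bytes
# at 0-704(%rsp) are zeroed before the callee-saved registers are restored and
# the function returns.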
.Lexit_gcm_decrypt:
cmpq $256,%r8
jbe .Lskip_hkeys_cleanup_983
vpxor %xmm0,%xmm0,%xmm0
vmovdqa64 %zmm0,0(%rsp)
vmovdqa64 %zmm0,64(%rsp)
vmovdqa64 %zmm0,128(%rsp)
vmovdqa64 %zmm0,192(%rsp)
vmovdqa64 %zmm0,256(%rsp)
vmovdqa64 %zmm0,320(%rsp)
vmovdqa64 %zmm0,384(%rsp)
vmovdqa64 %zmm0,448(%rsp)
vmovdqa64 %zmm0,512(%rsp)
vmovdqa64 %zmm0,576(%rsp)
vmovdqa64 %zmm0,640(%rsp)
vmovdqa64 %zmm0,704(%rsp)
.Lskip_hkeys_cleanup_983:
vzeroupper
leaq (%rbp),%rsp
.cfi_def_cfa_register %rsp
popq %r15
.cfi_adjust_cfa_offset -8
.cfi_restore %r15
popq %r14
.cfi_adjust_cfa_offset -8
.cfi_restore %r14
popq %r13
.cfi_adjust_cfa_offset -8
.cfi_restore %r13
popq %r12
.cfi_adjust_cfa_offset -8
.cfi_restore %r12
popq %rbp
.cfi_adjust_cfa_offset -8
.cfi_restore %rbp
popq %rbx
.cfi_adjust_cfa_offset -8
.cfi_restore %rbx
.byte 0xf3,0xc3
.Ldecrypt_seh_end:
.cfi_endproc
.size ossl_aes_gcm_decrypt_avx512, .-ossl_aes_gcm_decrypt_avx512
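# Note: the finalize routine below appears to compute the final authentication
# tag from the GCM context in %rdi, with %rsi flagging a buffered partial
# block: it loads the hash key from 336(%rdi), the value at 32(%rdi)
# (presumably the encrypted initial counter block) and the GHASH accumulator at
# 64(%rdi), folds in the partial block when %rsi is non-zero, mixes in the bit
# lengths taken from 48/56(%rdi), reduces, byte-reflects, and writes the tag
# back at 64(%rdi).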
.globl ossl_aes_gcm_finalize_avx512
.type ossl_aes_gcm_finalize_avx512,@function
.align 32
ossl_aes_gcm_finalize_avx512:
.cfi_startproc
.byte 243,15,30,250
vmovdqu 336(%rdi),%xmm2
vmovdqu 32(%rdi),%xmm3
vmovdqu 64(%rdi),%xmm4

cmpq $0,%rsi
je .L_partial_done_984

vpclmulqdq $0x11,%xmm2,%xmm4,%xmm0
vpclmulqdq $0x00,%xmm2,%xmm4,%xmm16
vpclmulqdq $0x01,%xmm2,%xmm4,%xmm17
vpclmulqdq $0x10,%xmm2,%xmm4,%xmm4
vpxorq %xmm17,%xmm4,%xmm4

vpsrldq $8,%xmm4,%xmm17
vpslldq $8,%xmm4,%xmm4
vpxorq %xmm17,%xmm0,%xmm0
vpxorq %xmm16,%xmm4,%xmm4

vmovdqu64 POLY2(%rip),%xmm17

vpclmulqdq $0x01,%xmm4,%xmm17,%xmm16
vpslldq $8,%xmm16,%xmm16
vpxorq %xmm16,%xmm4,%xmm4

vpclmulqdq $0x00,%xmm4,%xmm17,%xmm16
vpsrldq $4,%xmm16,%xmm16
vpclmulqdq $0x10,%xmm4,%xmm17,%xmm4
vpslldq $4,%xmm4,%xmm4

vpternlogq $0x96,%xmm16,%xmm0,%xmm4

.L_partial_done_984:
vmovq 56(%rdi),%xmm5
vpinsrq $1,48(%rdi),%xmm5,%xmm5
vpsllq $3,%xmm5,%xmm5

vpxor %xmm5,%xmm4,%xmm4

vpclmulqdq $0x11,%xmm2,%xmm4,%xmm0
vpclmulqdq $0x00,%xmm2,%xmm4,%xmm16
vpclmulqdq $0x01,%xmm2,%xmm4,%xmm17
vpclmulqdq $0x10,%xmm2,%xmm4,%xmm4
vpxorq %xmm17,%xmm4,%xmm4

vpsrldq $8,%xmm4,%xmm17
vpslldq $8,%xmm4,%xmm4
vpxorq %xmm17,%xmm0,%xmm0
vpxorq %xmm16,%xmm4,%xmm4

vmovdqu64 POLY2(%rip),%xmm17

vpclmulqdq $0x01,%xmm4,%xmm17,%xmm16
vpslldq $8,%xmm16,%xmm16
vpxorq %xmm16,%xmm4,%xmm4

vpclmulqdq $0x00,%xmm4,%xmm17,%xmm16
vpsrldq $4,%xmm16,%xmm16
vpclmulqdq $0x10,%xmm4,%xmm17,%xmm4
vpslldq $4,%xmm4,%xmm4

vpternlogq $0x96,%xmm16,%xmm0,%xmm4

vpshufb SHUF_MASK(%rip),%xmm4,%xmm4
vpxor %xmm4,%xmm3,%xmm3

.L_return_T_984:
vmovdqu %xmm3,64(%rdi)
.Labort_finalize:
.byte 0xf3,0xc3
.cfi_endproc
.size ossl_aes_gcm_finalize_avx512, .-ossl_aes_gcm_finalize_avx512
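# Note: ossl_gcm_gmult_avx512 below performs a single GHASH multiplication: the
# 16-byte value at (%rdi) is carry-less multiplied by the hash key stored at
# 336(%rsi), reduced via POLY2, and written back to (%rdi).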
.globl ossl_gcm_gmult_avx512
.hidden ossl_gcm_gmult_avx512
.type ossl_gcm_gmult_avx512,@function
.align 32
ossl_gcm_gmult_avx512:
.cfi_startproc
.byte 243,15,30,250
vmovdqu64 (%rdi),%xmm1
vmovdqu64 336(%rsi),%xmm2

vpclmulqdq $0x11,%xmm2,%xmm1,%xmm3
vpclmulqdq $0x00,%xmm2,%xmm1,%xmm4
vpclmulqdq $0x01,%xmm2,%xmm1,%xmm5
vpclmulqdq $0x10,%xmm2,%xmm1,%xmm1
vpxorq %xmm5,%xmm1,%xmm1

vpsrldq $8,%xmm1,%xmm5
vpslldq $8,%xmm1,%xmm1
vpxorq %xmm5,%xmm3,%xmm3
vpxorq %xmm4,%xmm1,%xmm1

vmovdqu64 POLY2(%rip),%xmm5

vpclmulqdq $0x01,%xmm1,%xmm5,%xmm4
vpslldq $8,%xmm4,%xmm4
vpxorq %xmm4,%xmm1,%xmm1

vpclmulqdq $0x00,%xmm1,%xmm5,%xmm4
vpsrldq $4,%xmm4,%xmm4
vpclmulqdq $0x10,%xmm1,%xmm5,%xmm1
vpslldq $4,%xmm1,%xmm1

vpternlogq $0x96,%xmm4,%xmm3,%xmm1

vmovdqu64 %xmm1,(%rdi)
vzeroupper
.Labort_gmult:
.byte 0xf3,0xc3
.cfi_endproc
.size ossl_gcm_gmult_avx512, .-ossl_gcm_gmult_avx512
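# Note: read-only constants used above. POLY/POLY2 hold the GCM reduction
# constants, TWOONE and SHUF_MASK support hash-key derivation and byte
# reflection, the ddq_add_*/ddq_addbe_* vectors increment little- and
# big-endian counter blocks, and byte_len_to_mask_table /
# byte64_len_to_mask_table supply the partial-block load/store masks.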
.section .rodata
.align 16
POLY:.quad 0x0000000000000001, 0xC200000000000000

.align 64
POLY2:
.quad 0x00000001C2000000, 0xC200000000000000
.quad 0x00000001C2000000, 0xC200000000000000
.quad 0x00000001C2000000, 0xC200000000000000
.quad 0x00000001C2000000, 0xC200000000000000

.align 16
TWOONE:.quad 0x0000000000000001, 0x0000000100000000

.align 64
SHUF_MASK:
.quad 0x08090A0B0C0D0E0F, 0x0001020304050607
.quad 0x08090A0B0C0D0E0F, 0x0001020304050607
.quad 0x08090A0B0C0D0E0F, 0x0001020304050607
.quad 0x08090A0B0C0D0E0F, 0x0001020304050607

.align 16
SHIFT_MASK:
.quad 0x0706050403020100, 0x0f0e0d0c0b0a0908

ALL_F:
.quad 0xffffffffffffffff, 0xffffffffffffffff

ZERO:
.quad 0x0000000000000000, 0x0000000000000000

.align 16
ONE:
.quad 0x0000000000000001, 0x0000000000000000

.align 16
ONEf:
.quad 0x0000000000000000, 0x0100000000000000

.align 64
ddq_add_1234:
.quad 0x0000000000000001, 0x0000000000000000
.quad 0x0000000000000002, 0x0000000000000000
.quad 0x0000000000000003, 0x0000000000000000
.quad 0x0000000000000004, 0x0000000000000000

.align 64
ddq_add_5678:
.quad 0x0000000000000005, 0x0000000000000000
.quad 0x0000000000000006, 0x0000000000000000
.quad 0x0000000000000007, 0x0000000000000000
.quad 0x0000000000000008, 0x0000000000000000

.align 64
ddq_add_4444:
.quad 0x0000000000000004, 0x0000000000000000
.quad 0x0000000000000004, 0x0000000000000000
.quad 0x0000000000000004, 0x0000000000000000
.quad 0x0000000000000004, 0x0000000000000000

.align 64
ddq_add_8888:
.quad 0x0000000000000008, 0x0000000000000000
.quad 0x0000000000000008, 0x0000000000000000
.quad 0x0000000000000008, 0x0000000000000000
.quad 0x0000000000000008, 0x0000000000000000

.align 64
ddq_addbe_1234:
.quad 0x0000000000000000, 0x0100000000000000
.quad 0x0000000000000000, 0x0200000000000000
.quad 0x0000000000000000, 0x0300000000000000
.quad 0x0000000000000000, 0x0400000000000000

.align 64
ddq_addbe_4444:
.quad 0x0000000000000000, 0x0400000000000000
.quad 0x0000000000000000, 0x0400000000000000
.quad 0x0000000000000000, 0x0400000000000000
.quad 0x0000000000000000, 0x0400000000000000

.align 64
byte_len_to_mask_table:
.value 0x0000, 0x0001, 0x0003, 0x0007
.value 0x000f, 0x001f, 0x003f, 0x007f
.value 0x00ff, 0x01ff, 0x03ff, 0x07ff
.value 0x0fff, 0x1fff, 0x3fff, 0x7fff
.value 0xffff

.align 64
byte64_len_to_mask_table:
.quad 0x0000000000000000, 0x0000000000000001
.quad 0x0000000000000003, 0x0000000000000007
.quad 0x000000000000000f, 0x000000000000001f
.quad 0x000000000000003f, 0x000000000000007f
.quad 0x00000000000000ff, 0x00000000000001ff
.quad 0x00000000000003ff, 0x00000000000007ff
.quad 0x0000000000000fff, 0x0000000000001fff
.quad 0x0000000000003fff, 0x0000000000007fff
.quad 0x000000000000ffff, 0x000000000001ffff
.quad 0x000000000003ffff, 0x000000000007ffff
.quad 0x00000000000fffff, 0x00000000001fffff
.quad 0x00000000003fffff, 0x00000000007fffff
.quad 0x0000000000ffffff, 0x0000000001ffffff
.quad 0x0000000003ffffff, 0x0000000007ffffff
.quad 0x000000000fffffff, 0x000000001fffffff
.quad 0x000000003fffffff, 0x000000007fffffff
.quad 0x00000000ffffffff, 0x00000001ffffffff
.quad 0x00000003ffffffff, 0x00000007ffffffff
.quad 0x0000000fffffffff, 0x0000001fffffffff
.quad 0x0000003fffffffff, 0x0000007fffffffff
.quad 0x000000ffffffffff, 0x000001ffffffffff
.quad 0x000003ffffffffff, 0x000007ffffffffff
.quad 0x00000fffffffffff, 0x00001fffffffffff
.quad 0x00003fffffffffff, 0x00007fffffffffff
.quad 0x0000ffffffffffff, 0x0001ffffffffffff
.quad 0x0003ffffffffffff, 0x0007ffffffffffff
.quad 0x000fffffffffffff, 0x001fffffffffffff
.quad 0x003fffffffffffff, 0x007fffffffffffff
.quad 0x00ffffffffffffff, 0x01ffffffffffffff
.quad 0x03ffffffffffffff, 0x07ffffffffffffff
.quad 0x0fffffffffffffff, 0x1fffffffffffffff
.quad 0x3fffffffffffffff, 0x7fffffffffffffff
.quad 0xffffffffffffffff
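# Note: the .note.gnu.property section below emits the x86 feature property
# (type 0xc0000002) with value 3, presumably marking the object as compatible
# with Intel CET (IBT and SHSTK), which matches the endbr64 markers
# (.byte 243,15,30,250) at each entry point.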
.section ".note.gnu.property", "a"
.p2align 3
.long 1f - 0f
.long 4f - 1f
.long 5
0:
# "GNU" encoded with .byte, since .asciz isn't supported
# on Solaris.
.byte 0x47
.byte 0x4e
.byte 0x55
.byte 0
1:
.p2align 3
.long 0xc0000002
.long 3f - 2f
2:
.long 3
3:
.p2align 3
4: