x86 memcmp: 4+2+1-byte loads vs. overlapping 4+4-byte loads for the 7-byte case

Created Diff never expires
22 removals
204 lines
16 additions
198 lines
.LCPI0_0:
.LCPI0_0:
.byte 15 # 0xf
.byte 15 # 0xf
.byte 14 # 0xe
.byte 14 # 0xe
.byte 13 # 0xd
.byte 13 # 0xd
.byte 12 # 0xc
.byte 12 # 0xc
.byte 11 # 0xb
.byte 11 # 0xb
.byte 10 # 0xa
.byte 10 # 0xa
.byte 9 # 0x9
.byte 9 # 0x9
.byte 8 # 0x8
.byte 8 # 0x8
.byte 7 # 0x7
.byte 7 # 0x7
.byte 6 # 0x6
.byte 6 # 0x6
.byte 5 # 0x5
.byte 5 # 0x5
.byte 4 # 0x4
.byte 4 # 0x4
.byte 3 # 0x3
.byte 3 # 0x3
.byte 2 # 0x2
.byte 2 # 0x2
.byte 1 # 0x1
.byte 1 # 0x1
.byte 0 # 0x0
.byte 0 # 0x0
memcmp: # @memcmp
memcmp: # @memcmp
xor eax, eax
xor eax, eax
cmp rdx, 8
cmp rdx, 8
ja .LBB0_19
ja .LBB0_19
lea rcx, [rip + .LJTI0_0]
lea rcx, [rip + .LJTI0_0]
movsxd rdx, dword ptr [rcx + 4*rdx]
movsxd rdx, dword ptr [rcx + 4*rdx]
add rdx, rcx
add rdx, rcx
jmp rdx
jmp rdx
.LBB0_2:
.LBB0_2:
movzx eax, byte ptr [rdi]
movzx eax, byte ptr [rdi]
movzx ecx, byte ptr [rsi]
movzx ecx, byte ptr [rsi]
sub eax, ecx
sub eax, ecx
ret
ret
.LBB0_19:
.LBB0_19:
cmp rdx, 16
cmp rdx, 16
ja .LBB0_23
ja .LBB0_23
mov rcx, qword ptr [rdi]
mov rcx, qword ptr [rdi]
mov r8, qword ptr [rsi]
mov r8, qword ptr [rsi]
cmp rcx, r8
cmp rcx, r8
jne .LBB0_21
jne .LBB0_21
mov rcx, qword ptr [rdi + rdx - 8]
mov rcx, qword ptr [rdi + rdx - 8]
mov rdx, qword ptr [rsi + rdx - 8]
mov rdx, qword ptr [rsi + rdx - 8]
cmp rcx, rdx
cmp rcx, rdx
je .LBB0_34
je .LBB0_34
jmp .LBB0_17
jmp .LBB0_17
.LBB0_3:
.LBB0_3:
movbe ax, word ptr [rdi]
movbe ax, word ptr [rdi]
movzx eax, ax
movzx eax, ax
movbe cx, word ptr [rsi]
movbe cx, word ptr [rsi]
movzx ecx, cx
movzx ecx, cx
sub eax, ecx
sub eax, ecx
ret
ret
.LBB0_5:
.LBB0_5:
movbe ax, word ptr [rdi]
movbe ax, word ptr [rdi]
movzx eax, ax
movzx eax, ax
movbe cx, word ptr [rsi]
movbe cx, word ptr [rsi]
movzx ecx, cx
movzx ecx, cx
sub eax, ecx
sub eax, ecx
je .LBB0_6
je .LBB0_6
.LBB0_34:
.LBB0_34:
ret
ret
.LBB0_7:
.LBB0_7:
movbe eax, dword ptr [rdi]
movbe eax, dword ptr [rdi]
movbe ecx, dword ptr [rsi]
movbe ecx, dword ptr [rsi]
Text moved to lines 100-103
sub rax, rcx
jmp .LBB0_8
movzx ecx, ax
.LBB0_9:
shr rax
or eax, ecx
ret
.LBB0_8:
movbe eax, dword ptr [rdi]
movbe eax, dword ptr [rdi]
movbe ecx, dword ptr [rsi]
movbe ecx, dword ptr [rsi]
sub rax, rcx
sub rax, rcx
movzx ecx, ax
movzx ecx, ax
shr rax
shr rax
or eax, ecx
or eax, ecx
jne .LBB0_34
jne .LBB0_34
movzx eax, byte ptr [rdi + 4]
movzx eax, byte ptr [rdi + 4]
movzx ecx, byte ptr [rsi + 4]
movzx ecx, byte ptr [rsi + 4]
sub eax, ecx
sub eax, ecx
ret
ret
.LBB0_10:
.LBB0_11:
movbe eax, dword ptr [rdi]
movbe eax, dword ptr [rdi]
movbe ecx, dword ptr [rsi]
movbe ecx, dword ptr [rsi]
sub rax, rcx
sub rax, rcx
movzx ecx, ax
movzx ecx, ax
shr rax
shr rax
or eax, ecx
or eax, ecx
jne .LBB0_34
jne .LBB0_34
movbe ax, word ptr [rdi + 4]
movbe ax, word ptr [rdi + 4]
movzx eax, ax
movzx eax, ax
movbe cx, word ptr [rsi + 4]
movbe cx, word ptr [rsi + 4]
movzx ecx, cx
movzx ecx, cx
sub eax, ecx
sub eax, ecx
ret
ret
.LBB0_12:
.LBB0_13:
movbe eax, dword ptr [rdi]
movbe eax, dword ptr [rdi]
movbe ecx, dword ptr [rsi]
movbe ecx, dword ptr [rsi]
sub rax, rcx
sub rax, rcx
movzx ecx, ax
movzx ecx, ax
shr rax
shr rax
or eax, ecx
or eax, ecx
jne .LBB0_34
jne .LBB0_34
movbe ax, word ptr [rdi + 4]
movbe eax, dword ptr [rdi + 3]
movzx eax, ax
movbe ecx, dword ptr [rsi + 3]
movbe cx, word ptr [rsi + 4]
.LBB0_8:
Text moved from lines 62-65
movzx ecx, cx
sub rax, rcx
sub eax, ecx
movzx ecx, ax
jne .LBB0_34
shr rax
movzx eax, byte ptr [rdi + 6]
or eax, ecx
movzx ecx, byte ptr [rsi + 6]
sub eax, ecx
ret
ret
.LBB0_15:
.LBB0_15:
mov rcx, qword ptr [rdi]
mov rcx, qword ptr [rdi]
mov rdx, qword ptr [rsi]
mov rdx, qword ptr [rsi]
cmp rcx, rdx
cmp rcx, rdx
je .LBB0_34
je .LBB0_34
.LBB0_17:
.LBB0_17:
bswap rcx
bswap rcx
bswap rdx
bswap rdx
xor eax, eax
xor eax, eax
cmp rcx, rdx
cmp rcx, rdx
sbb eax, eax
sbb eax, eax
or eax, 1
or eax, 1
ret
ret
.LBB0_23:
.LBB0_23:
cmp rdx, 384
cmp rdx, 384
jae .LBB0_24
jae .LBB0_24
.LBB0_28:
.LBB0_28:
add rdx, -16
add rdx, -16
xor eax, eax
xor eax, eax
.LBB0_29: # =>This Inner Loop Header: Depth=1
.LBB0_29: # =>This Inner Loop Header: Depth=1
vmovdqu xmm0, xmmword ptr [rdi + rax]
vmovdqu xmm0, xmmword ptr [rdi + rax]
vmovdqu xmm1, xmmword ptr [rsi + rax]
vmovdqu xmm1, xmmword ptr [rsi + rax]
vpxor xmm2, xmm1, xmm0
vpxor xmm2, xmm1, xmm0
vptest xmm2, xmm2
vptest xmm2, xmm2
jne .LBB0_32
jne .LBB0_32
add rax, 16
add rax, 16
cmp rax, rdx
cmp rax, rdx
jb .LBB0_29
jb .LBB0_29
vmovdqu xmm0, xmmword ptr [rdi + rdx]
vmovdqu xmm0, xmmword ptr [rdi + rdx]
vmovdqu xmm1, xmmword ptr [rsi + rdx]
vmovdqu xmm1, xmmword ptr [rsi + rdx]
vpxor xmm2, xmm1, xmm0
vpxor xmm2, xmm1, xmm0
xor eax, eax
xor eax, eax
vptest xmm2, xmm2
vptest xmm2, xmm2
je .LBB0_34
je .LBB0_34
.LBB0_32:
.LBB0_32:
vpmaxub xmm2, xmm1, xmm0
vpmaxub xmm2, xmm1, xmm0
vpcmpeqb xmm1, xmm1, xmm2
vpcmpeqb xmm1, xmm1, xmm2
vmovdqa xmm3, xmmword ptr [rip + .LCPI0_0] # xmm3 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
vmovdqa xmm3, xmmword ptr [rip + .LCPI0_0] # xmm3 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
vpshufb xmm1, xmm1, xmm3
vpshufb xmm1, xmm1, xmm3
vpmovmskb ecx, xmm1
vpmovmskb ecx, xmm1
vpcmpeqb xmm0, xmm0, xmm2
vpcmpeqb xmm0, xmm0, xmm2
vpshufb xmm0, xmm0, xmm3
vpshufb xmm0, xmm0, xmm3
vpmovmskb eax, xmm0
vpmovmskb eax, xmm0
sub eax, ecx
sub eax, ecx
ret
ret
.LBB0_6:
.LBB0_6:
movzx eax, byte ptr [rdi + 2]
movzx eax, byte ptr [rdi + 2]
movzx ecx, byte ptr [rsi + 2]
movzx ecx, byte ptr [rsi + 2]
sub eax, ecx
sub eax, ecx
ret
ret
.LBB0_21:
.LBB0_21:
bswap rcx
bswap rcx
bswap r8
bswap r8
xor eax, eax
xor eax, eax
cmp rcx, r8
cmp rcx, r8
sbb eax, eax
sbb eax, eax
or eax, 1
or eax, 1
ret
ret
.LBB0_24:
.LBB0_24:
mov ecx, edi
mov ecx, edi
and ecx, 15
and ecx, 15
je .LBB0_28
je .LBB0_28
vmovdqu xmm0, xmmword ptr [rdi]
vmovdqu xmm0, xmmword ptr [rdi]
vmovdqu xmm1, xmmword ptr [rsi]
vmovdqu xmm1, xmmword ptr [rsi]
vpxor xmm2, xmm1, xmm0
vpxor xmm2, xmm1, xmm0
vptest xmm2, xmm2
vptest xmm2, xmm2
je .LBB0_27
je .LBB0_27
vpmaxub xmm2, xmm1, xmm0
vpmaxub xmm2, xmm1, xmm0
vpcmpeqb xmm1, xmm1, xmm2
vpcmpeqb xmm1, xmm1, xmm2
vmovdqa xmm3, xmmword ptr [rip + .LCPI0_0] # xmm3 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
vmovdqa xmm3, xmmword ptr [rip + .LCPI0_0] # xmm3 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
vpshufb xmm1, xmm1, xmm3
vpshufb xmm1, xmm1, xmm3
vpmovmskb r8d, xmm1
vpmovmskb r8d, xmm1
vpcmpeqb xmm0, xmm0, xmm2
vpcmpeqb xmm0, xmm0, xmm2
vpshufb xmm0, xmm0, xmm3
vpshufb xmm0, xmm0, xmm3
vpmovmskb eax, xmm0
vpmovmskb eax, xmm0
sub eax, r8d
sub eax, r8d
jne .LBB0_34
jne .LBB0_34
.LBB0_27:
.LBB0_27:
mov eax, 16
mov eax, 16
sub rax, rcx
sub rax, rcx
add rdi, rax
add rdi, rax
add rsi, rax
add rsi, rax
sub rdx, rax
sub rdx, rax
jmp .LBB0_28
jmp .LBB0_28
.LJTI0_0:
.LJTI0_0:
.long .LBB0_34-.LJTI0_0
.long .LBB0_34-.LJTI0_0
.long .LBB0_2-.LJTI0_0
.long .LBB0_2-.LJTI0_0
.long .LBB0_3-.LJTI0_0
.long .LBB0_3-.LJTI0_0
.long .LBB0_5-.LJTI0_0
.long .LBB0_5-.LJTI0_0
.long .LBB0_7-.LJTI0_0
.long .LBB0_7-.LJTI0_0
.long .LBB0_8-.LJTI0_0
.long .LBB0_9-.LJTI0_0
.long .LBB0_10-.LJTI0_0
.long .LBB0_11-.LJTI0_0
.long .LBB0_12-.LJTI0_0
.long .LBB0_13-.LJTI0_0
.long .LBB0_15-.LJTI0_0
.long .LBB0_15-.LJTI0_0