Merge pull request #2825 from terrelln/huf-asm-comments

[asm] Switch to C style comments
dev
Nick Terrell 2021-10-20 18:06:37 -07:00 committed by GitHub
commit dad8a3cf34
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 456 additions and 446 deletions

View File

@ -1,24 +1,27 @@
# Calling convention:
#
# %rdi contains the first argument: HUF_DecompressAsmArgs*.
# %rbp isn't maintained (no frame pointer).
# %rsp contains the stack pointer that grows down.
# No red-zone is assumed, only addresses >= %rsp are used.
# All register contents are preserved.
#
# TODO: Support Windows calling convention.
#if !defined(HUF_DISABLE_ASM) && defined(__x86_64__) #if !defined(HUF_DISABLE_ASM) && defined(__x86_64__)
/* Calling convention:
*
* %rdi contains the first argument: HUF_DecompressAsmArgs*.
* %rbp isn't maintained (no frame pointer).
* %rsp contains the stack pointer that grows down.
* No red-zone is assumed, only addresses >= %rsp are used.
* All register contents are preserved.
*
* TODO: Support Windows calling convention.
*/
.global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop .global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
.global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop .global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
.global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop .global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
.global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop .global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
.text .text
# Sets up register mappings for clarity. /* Sets up register mappings for clarity.
# op[], bits[], dtable & ip[0] each get their own register. * op[], bits[], dtable & ip[0] each get their own register.
# ip[1,2,3] & olimit alias var[]. * ip[1,2,3] & olimit alias var[].
# %rax is a scratch register. * %rax is a scratch register.
*/
#define op0 rsi #define op0 rsi
#define op1 rbx #define op1 rbx
@ -37,21 +40,22 @@
#define dtable r14 #define dtable r14
#define olimit r15 #define olimit r15
# var[] aliases ip[1,2,3] & olimit /* var[] aliases ip[1,2,3] & olimit
# ip[1,2,3] are saved every iteration. * ip[1,2,3] are saved every iteration.
# olimit is only used in compute_olimit. * olimit is only used in compute_olimit.
*/
#define var0 r15 #define var0 r15
#define var1 r9 #define var1 r9
#define var2 r10 #define var2 r10
#define var3 r11 #define var3 r11
# 32-bit var registers /* 32-bit var registers */
#define vard0 r15d #define vard0 r15d
#define vard1 r9d #define vard1 r9d
#define vard2 r10d #define vard2 r10d
#define vard3 r11d #define vard3 r11d
# Helper macro: args if idx != 4. /* Helper macro: args if idx != 4. */
#define IF_NOT_4_0(...) __VA_ARGS__ #define IF_NOT_4_0(...) __VA_ARGS__
#define IF_NOT_4_1(...) __VA_ARGS__ #define IF_NOT_4_1(...) __VA_ARGS__
#define IF_NOT_4_2(...) __VA_ARGS__ #define IF_NOT_4_2(...) __VA_ARGS__
@ -60,25 +64,26 @@
#define IF_NOT_4_(idx, ...) IF_NOT_4_##idx(__VA_ARGS__) #define IF_NOT_4_(idx, ...) IF_NOT_4_##idx(__VA_ARGS__)
#define IF_NOT_4(idx, ...) IF_NOT_4_(idx, __VA_ARGS__) #define IF_NOT_4(idx, ...) IF_NOT_4_(idx, __VA_ARGS__)
# Calls X(N) for each stream 0, 1, 2, 3. /* Calls X(N) for each stream 0, 1, 2, 3. */
#define FOR_EACH_STREAM(X) \ #define FOR_EACH_STREAM(X) \
X(0); \ X(0); \
X(1); \ X(1); \
X(2); \ X(2); \
X(3) X(3)
# Calls X(N, idx) for each stream 0, 1, 2, 3. /* Calls X(N, idx) for each stream 0, 1, 2, 3. */
#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \ #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
X(0, idx); \ X(0, idx); \
X(1, idx); \ X(1, idx); \
X(2, idx); \ X(2, idx); \
X(3, idx) X(3, idx)
# Define both _HUF_* & HUF_* symbols because MacOS /* Define both _HUF_* & HUF_* symbols because MacOS
# C symbols are prefixed with '_' & Linux symbols aren't. * C symbols are prefixed with '_' & Linux symbols aren't.
*/
_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
# Save all registers - even if they are callee saved for simplicity. /* Save all registers - even if they are callee saved for simplicity. */
push %rax push %rax
push %rbx push %rbx
push %rcx push %rcx
@ -95,7 +100,7 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
push %r14 push %r14
push %r15 push %r15
# Read HUF_DecompressAsmArgs* args from %rax /* Read HUF_DecompressAsmArgs* args from %rax */
movq %rdi, %rax movq %rdi, %rax
movq 0(%rax), %ip0 movq 0(%rax), %ip0
movq 8(%rax), %ip1 movq 8(%rax), %ip1
@ -110,36 +115,37 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
movq 80(%rax), %bits2 movq 80(%rax), %bits2
movq 88(%rax), %bits3 movq 88(%rax), %bits3
movq 96(%rax), %dtable movq 96(%rax), %dtable
push %rax # argument push %rax /* argument */
push 104(%rax) # ilimit push 104(%rax) /* ilimit */
push 112(%rax) # oend push 112(%rax) /* oend */
push %olimit # olimit space push %olimit /* olimit space */
subq $24, %rsp subq $24, %rsp
.L_4X1_compute_olimit: .L_4X1_compute_olimit:
# Computes how many iterations we can do safely /* Computes how many iterations we can do safely
# %r15, %rax may be clobbered * %r15, %rax may be clobbered
# rbx, rdx must be saved * rbx, rdx must be saved
# op3 & ip0 mustn't be clobbered * op3 & ip0 mustn't be clobbered
*/
movq %rbx, 0(%rsp) movq %rbx, 0(%rsp)
movq %rdx, 8(%rsp) movq %rdx, 8(%rsp)
movq 32(%rsp), %rax # rax = oend movq 32(%rsp), %rax /* rax = oend */
subq %op3, %rax # rax = oend - op3 subq %op3, %rax /* rax = oend - op3 */
# r15 = (oend - op3) / 5 /* r15 = (oend - op3) / 5 */
movabsq $-3689348814741910323, %rdx movabsq $-3689348814741910323, %rdx
mulq %rdx mulq %rdx
movq %rdx, %r15 movq %rdx, %r15
shrq $2, %r15 shrq $2, %r15
movq %ip0, %rax # rax = ip0 movq %ip0, %rax /* rax = ip0 */
movq 40(%rsp), %rdx # rdx = ilimit movq 40(%rsp), %rdx /* rdx = ilimit */
subq %rdx, %rax # rax = ip0 - ilimit subq %rdx, %rax /* rax = ip0 - ilimit */
movq %rax, %rbx # rbx = ip0 - ilimit movq %rax, %rbx /* rbx = ip0 - ilimit */
# rdx = (ip0 - ilimit) / 7 /* rdx = (ip0 - ilimit) / 7 */
movabsq $2635249153387078803, %rdx movabsq $2635249153387078803, %rdx
mulq %rdx mulq %rdx
subq %rdx, %rbx subq %rdx, %rbx
@ -147,70 +153,74 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
addq %rbx, %rdx addq %rbx, %rdx
shrq $2, %rdx shrq $2, %rdx
# r15 = min(%rdx, %r15) /* r15 = min(%rdx, %r15) */
cmpq %rdx, %r15 cmpq %rdx, %r15
cmova %rdx, %r15 cmova %rdx, %r15
# r15 = r15 * 5 /* r15 = r15 * 5 */
leaq (%r15, %r15, 4), %r15 leaq (%r15, %r15, 4), %r15
# olimit = op3 + r15 /* olimit = op3 + r15 */
addq %op3, %olimit addq %op3, %olimit
movq 8(%rsp), %rdx movq 8(%rsp), %rdx
movq 0(%rsp), %rbx movq 0(%rsp), %rbx
# If (op3 + 20 > olimit) /* If (op3 + 20 > olimit) */
movq %op3, %rax # rax = op3 movq %op3, %rax /* rax = op3 */
addq $20, %rax # rax = op3 + 20 addq $20, %rax /* rax = op3 + 20 */
cmpq %rax, %olimit # op3 + 20 > olimit cmpq %rax, %olimit /* op3 + 20 > olimit */
jb .L_4X1_exit jb .L_4X1_exit
# If (ip1 < ip0) go to exit /* If (ip1 < ip0) go to exit */
cmpq %ip0, %ip1 cmpq %ip0, %ip1
jb .L_4X1_exit jb .L_4X1_exit
# If (ip2 < ip1) go to exit /* If (ip2 < ip1) go to exit */
cmpq %ip1, %ip2 cmpq %ip1, %ip2
jb .L_4X1_exit jb .L_4X1_exit
# If (ip3 < ip2) go to exit /* If (ip3 < ip2) go to exit */
cmpq %ip2, %ip3 cmpq %ip2, %ip3
jb .L_4X1_exit jb .L_4X1_exit
# Reads top 11 bits from bits[n] /* Reads top 11 bits from bits[n]
# Loads dt[bits[n]] into var[n] * Loads dt[bits[n]] into var[n]
*/
#define GET_NEXT_DELT(n) \ #define GET_NEXT_DELT(n) \
movq $53, %var##n; \ movq $53, %var##n; \
shrxq %var##n, %bits##n, %var##n; \ shrxq %var##n, %bits##n, %var##n; \
movzwl (%dtable,%var##n,2),%vard##n movzwl (%dtable,%var##n,2),%vard##n
# var[n] must contain the DTable entry computed with GET_NEXT_DELT /* var[n] must contain the DTable entry computed with GET_NEXT_DELT
# Moves var[n] to %rax * Moves var[n] to %rax
# bits[n] <<= var[n] & 63 * bits[n] <<= var[n] & 63
# op[n][idx] = %rax >> 8 * op[n][idx] = %rax >> 8
# %ah is a way to access bits [8, 16) of %rax * %ah is a way to access bits [8, 16) of %rax
*/
#define DECODE_FROM_DELT(n, idx) \ #define DECODE_FROM_DELT(n, idx) \
movq %var##n, %rax; \ movq %var##n, %rax; \
shlxq %var##n, %bits##n, %bits##n; \ shlxq %var##n, %bits##n, %bits##n; \
movb %ah, idx(%op##n) movb %ah, idx(%op##n)
# Assumes GET_NEXT_DELT has been called. /* Assumes GET_NEXT_DELT has been called.
# Calls DECODE_FROM_DELT then GET_NEXT_DELT if idx != 4 * Calls DECODE_FROM_DELT then GET_NEXT_DELT if idx != 4
*/
#define DECODE(n, idx) \ #define DECODE(n, idx) \
DECODE_FROM_DELT(n, idx); \ DECODE_FROM_DELT(n, idx); \
IF_NOT_4(idx, GET_NEXT_DELT(n)) IF_NOT_4(idx, GET_NEXT_DELT(n))
# // ctz & nbBytes is stored in bits[n] /* // ctz & nbBytes is stored in bits[n]
# // nbBits is stored in %rax * // nbBits is stored in %rax
# ctz = CTZ[bits[n]] * ctz = CTZ[bits[n]]
# nbBits = ctz & 7 * nbBits = ctz & 7
# nbBytes = ctz >> 3 * nbBytes = ctz >> 3
# op[n] += 5 * op[n] += 5
# ip[n] -= nbBytes * ip[n] -= nbBytes
# // Note: x86-64 is little-endian ==> no bswap * // Note: x86-64 is little-endian ==> no bswap
# bits[n] = MEM_readST(ip[n]) | 1 * bits[n] = MEM_readST(ip[n]) | 1
# bits[n] <<= nbBits * bits[n] <<= nbBits
*/
#define RELOAD_BITS(n) \ #define RELOAD_BITS(n) \
bsfq %bits##n, %bits##n; \ bsfq %bits##n, %bits##n; \
movq %bits##n, %rax; \ movq %bits##n, %rax; \
@ -220,38 +230,40 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
subq %bits##n, %ip##n; \ subq %bits##n, %ip##n; \
movq (%ip##n), %bits##n; \ movq (%ip##n), %bits##n; \
orq $1, %bits##n; \ orq $1, %bits##n; \
shlx %rax, %bits##n, %bits##n; shlx %rax, %bits##n, %bits##n
# Store clobbered variables on the stack /* Store clobbered variables on the stack */
movq %olimit, 24(%rsp) movq %olimit, 24(%rsp)
movq %ip1, 0(%rsp) movq %ip1, 0(%rsp)
movq %ip2, 8(%rsp) movq %ip2, 8(%rsp)
movq %ip3, 16(%rsp) movq %ip3, 16(%rsp)
# Call GET_NEXT_DELT for each stream /* Call GET_NEXT_DELT for each stream */
FOR_EACH_STREAM(GET_NEXT_DELT) FOR_EACH_STREAM(GET_NEXT_DELT)
.p2align 6 .p2align 6
.L_4X1_loop_body: .L_4X1_loop_body:
# LLVM-MCA-BEGIN decode-4X1 /* Decode 5 symbols in each of the 4 streams (20 total)
# Decode 5 symbols in each of the 4 streams (20 total) * Must have called GET_NEXT_DELT for each stream
# Must have called GET_NEXT_DELT for each stream */
FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
# Load ip[1,2,3] from stack (var[] aliases them) /* Load ip[1,2,3] from stack (var[] aliases them)
# ip[] is needed for RELOAD_BITS * ip[] is needed for RELOAD_BITS
# Each will be stored back to the stack after RELOAD * Each will be stored back to the stack after RELOAD
*/
movq 0(%rsp), %ip1 movq 0(%rsp), %ip1
movq 8(%rsp), %ip2 movq 8(%rsp), %ip2
movq 16(%rsp), %ip3 movq 16(%rsp), %ip3
# Reload each stream & fetch the next table entry /* Reload each stream & fetch the next table entry
# to prepare for the next iteration * to prepare for the next iteration
*/
RELOAD_BITS(0) RELOAD_BITS(0)
GET_NEXT_DELT(0) GET_NEXT_DELT(0)
@ -267,33 +279,32 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
movq %ip3, 16(%rsp) movq %ip3, 16(%rsp)
GET_NEXT_DELT(3) GET_NEXT_DELT(3)
# If op3 < olimit: continue the loop /* If op3 < olimit: continue the loop */
cmp %op3, 24(%rsp) cmp %op3, 24(%rsp)
ja .L_4X1_loop_body ja .L_4X1_loop_body
# Reload ip[1,2,3] from stack /* Reload ip[1,2,3] from stack */
movq 0(%rsp), %ip1 movq 0(%rsp), %ip1
movq 8(%rsp), %ip2 movq 8(%rsp), %ip2
movq 16(%rsp), %ip3 movq 16(%rsp), %ip3
# Re-compute olimit /* Re-compute olimit */
jmp .L_4X1_compute_olimit jmp .L_4X1_compute_olimit
#undef GET_NEXT_DELT #undef GET_NEXT_DELT
#undef DECODE_FROM_DELT #undef DECODE_FROM_DELT
#undef DECODE #undef DECODE
#undef RELOAD_BITS #undef RELOAD_BITS
# LLVM-MCA-END
.L_4X1_exit: .L_4X1_exit:
addq $24, %rsp addq $24, %rsp
# Restore stack (oend & olimit) /* Restore stack (oend & olimit) */
pop %rax # olimit pop %rax /* olimit */
pop %rax # oend pop %rax /* oend */
pop %rax # ilimit pop %rax /* ilimit */
pop %rax # arg pop %rax /* arg */
# Save ip / op / bits /* Save ip / op / bits */
movq %ip0, 0(%rax) movq %ip0, 0(%rax)
movq %ip1, 8(%rax) movq %ip1, 8(%rax)
movq %ip2, 16(%rax) movq %ip2, 16(%rax)
@ -307,7 +318,7 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
movq %bits2, 80(%rax) movq %bits2, 80(%rax)
movq %bits3, 88(%rax) movq %bits3, 88(%rax)
# Restore registers /* Restore registers */
pop %r15 pop %r15
pop %r14 pop %r14
pop %r13 pop %r13
@ -327,7 +338,7 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
# Save all registers - even if they are callee saved for simplicity. /* Save all registers - even if they are callee saved for simplicity. */
push %rax push %rax
push %rbx push %rbx
push %rcx push %rcx
@ -358,39 +369,40 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
movq 80(%rax), %bits2 movq 80(%rax), %bits2
movq 88(%rax), %bits3 movq 88(%rax), %bits3
movq 96(%rax), %dtable movq 96(%rax), %dtable
push %rax # argument push %rax /* argument */
push %rax # olimit push %rax /* olimit */
push 104(%rax) # ilimit push 104(%rax) /* ilimit */
movq 112(%rax), %rax movq 112(%rax), %rax
push %rax # oend3 push %rax /* oend3 */
movq %op3, %rax movq %op3, %rax
push %rax # oend2 push %rax /* oend2 */
movq %op2, %rax movq %op2, %rax
push %rax # oend1 push %rax /* oend1 */
movq %op1, %rax movq %op1, %rax
push %rax # oend0 push %rax /* oend0 */
# Scratch space /* Scratch space */
subq $8, %rsp subq $8, %rsp
.L_4X2_compute_olimit: .L_4X2_compute_olimit:
# Computes how many iterations we can do safely /* Computes how many iterations we can do safely
# %r15, %rax may be clobbered * %r15, %rax may be clobbered
# rdx must be saved * rdx must be saved
# op[0,1,2,3] & ip0 mustn't be clobbered * op[0,1,2,3] & ip0 mustn't be clobbered
*/
movq %rdx, 0(%rsp) movq %rdx, 0(%rsp)
# We can consume up to 7 input bytes each iteration. /* We can consume up to 7 input bytes each iteration. */
movq %ip0, %rax # rax = ip0 movq %ip0, %rax /* rax = ip0 */
movq 40(%rsp), %rdx # rdx = ilimit movq 40(%rsp), %rdx /* rdx = ilimit */
subq %rdx, %rax # rax = ip0 - ilimit subq %rdx, %rax /* rax = ip0 - ilimit */
movq %rax, %r15 # r15 = ip0 - ilimit movq %rax, %r15 /* r15 = ip0 - ilimit */
# rdx = rax / 7 /* rdx = rax / 7 */
movabsq $2635249153387078803, %rdx movabsq $2635249153387078803, %rdx
mulq %rdx mulq %rdx
subq %rdx, %r15 subq %rdx, %r15
@ -398,71 +410,71 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
addq %r15, %rdx addq %r15, %rdx
shrq $2, %rdx shrq $2, %rdx
# r15 = (ip0 - ilimit) / 7 /* r15 = (ip0 - ilimit) / 7 */
movq %rdx, %r15 movq %rdx, %r15
movabsq $-3689348814741910323, %rdx movabsq $-3689348814741910323, %rdx
movq 8(%rsp), %rax # rax = oend0 movq 8(%rsp), %rax /* rax = oend0 */
subq %op0, %rax # rax = oend0 - op0 subq %op0, %rax /* rax = oend0 - op0 */
mulq %rdx mulq %rdx
shrq $3, %rdx # rdx = rax / 10 shrq $3, %rdx /* rdx = rax / 10 */
# r15 = min(%rdx, %r15) /* r15 = min(%rdx, %r15) */
cmpq %rdx, %r15 cmpq %rdx, %r15
cmova %rdx, %r15 cmova %rdx, %r15
movabsq $-3689348814741910323, %rdx movabsq $-3689348814741910323, %rdx
movq 16(%rsp), %rax # rax = oend1 movq 16(%rsp), %rax /* rax = oend1 */
subq %op1, %rax # rax = oend1 - op1 subq %op1, %rax /* rax = oend1 - op1 */
mulq %rdx mulq %rdx
shrq $3, %rdx # rdx = rax / 10 shrq $3, %rdx /* rdx = rax / 10 */
# r15 = min(%rdx, %r15) /* r15 = min(%rdx, %r15) */
cmpq %rdx, %r15 cmpq %rdx, %r15
cmova %rdx, %r15 cmova %rdx, %r15
movabsq $-3689348814741910323, %rdx movabsq $-3689348814741910323, %rdx
movq 24(%rsp), %rax # rax = oend2 movq 24(%rsp), %rax /* rax = oend2 */
subq %op2, %rax # rax = oend2 - op2 subq %op2, %rax /* rax = oend2 - op2 */
mulq %rdx mulq %rdx
shrq $3, %rdx # rdx = rax / 10 shrq $3, %rdx /* rdx = rax / 10 */
# r15 = min(%rdx, %r15) /* r15 = min(%rdx, %r15) */
cmpq %rdx, %r15 cmpq %rdx, %r15
cmova %rdx, %r15 cmova %rdx, %r15
movabsq $-3689348814741910323, %rdx movabsq $-3689348814741910323, %rdx
movq 32(%rsp), %rax # rax = oend3 movq 32(%rsp), %rax /* rax = oend3 */
subq %op3, %rax # rax = oend3 - op3 subq %op3, %rax /* rax = oend3 - op3 */
mulq %rdx mulq %rdx
shrq $3, %rdx # rdx = rax / 10 shrq $3, %rdx /* rdx = rax / 10 */
# r15 = min(%rdx, %r15) /* r15 = min(%rdx, %r15) */
cmpq %rdx, %r15 cmpq %rdx, %r15
cmova %rdx, %r15 cmova %rdx, %r15
# olimit = op3 + 5 * r15 /* olimit = op3 + 5 * r15 */
movq %r15, %rax movq %r15, %rax
leaq (%op3, %rax, 4), %olimit leaq (%op3, %rax, 4), %olimit
addq %rax, %olimit addq %rax, %olimit
movq 0(%rsp), %rdx movq 0(%rsp), %rdx
# If (op3 + 10 > olimit) /* If (op3 + 10 > olimit) */
movq %op3, %rax # rax = op3 movq %op3, %rax /* rax = op3 */
addq $10, %rax # rax = op3 + 10 addq $10, %rax /* rax = op3 + 10 */
cmpq %rax, %olimit # op3 + 10 > olimit cmpq %rax, %olimit /* op3 + 10 > olimit */
jb .L_4X2_exit jb .L_4X2_exit
# If (ip1 < ip0) go to exit /* If (ip1 < ip0) go to exit */
cmpq %ip0, %ip1 cmpq %ip0, %ip1
jb .L_4X2_exit jb .L_4X2_exit
# If (ip2 < ip1) go to exit /* If (ip2 < ip1) go to exit */
cmpq %ip1, %ip2 cmpq %ip1, %ip2
jb .L_4X2_exit jb .L_4X2_exit
# If (ip3 < ip2) go to exit /* If (ip3 < ip2) go to exit */
cmpq %ip2, %ip3 cmpq %ip2, %ip3
jb .L_4X2_exit jb .L_4X2_exit
@ -484,7 +496,7 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
subq %bits##n, %ip##n; \ subq %bits##n, %ip##n; \
movq (%ip##n), %bits##n; \ movq (%ip##n), %bits##n; \
orq $1, %bits##n; \ orq $1, %bits##n; \
shlxq %rax, %bits##n, %bits##n; shlxq %rax, %bits##n, %bits##n
movq %olimit, 48(%rsp) movq %olimit, 48(%rsp)
@ -492,19 +504,17 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
.p2align 6 .p2align 6
.L_4X2_loop_body: .L_4X2_loop_body:
# LLVM-MCA-BEGIN decode-4X2 /* We clobber r8, so store it on the stack */
# We clobber r8, so store it on the stack
movq %r8, 0(%rsp) movq %r8, 0(%rsp)
# Decode 5 symbols from each of the 4 streams (20 symbols total). /* Decode 5 symbols from each of the 4 streams (20 symbols total). */
FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
# Reload r8 /* Reload r8 */
movq 0(%rsp), %r8 movq 0(%rsp), %r8
FOR_EACH_STREAM(RELOAD_BITS) FOR_EACH_STREAM(RELOAD_BITS)
@ -515,19 +525,18 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
#undef DECODE #undef DECODE
#undef RELOAD_BITS #undef RELOAD_BITS
# LLVM-MCA-END
.L_4X2_exit: .L_4X2_exit:
addq $8, %rsp addq $8, %rsp
# Restore stack (oend & olimit) /* Restore stack (oend & olimit) */
pop %rax # oend0 pop %rax /* oend0 */
pop %rax # oend1 pop %rax /* oend1 */
pop %rax # oend2 pop %rax /* oend2 */
pop %rax # oend3 pop %rax /* oend3 */
pop %rax # ilimit pop %rax /* ilimit */
pop %rax # olimit pop %rax /* olimit */
pop %rax # arg pop %rax /* arg */
# Save ip / op / bits /* Save ip / op / bits */
movq %ip0, 0(%rax) movq %ip0, 0(%rax)
movq %ip1, 8(%rax) movq %ip1, 8(%rax)
movq %ip2, 16(%rax) movq %ip2, 16(%rax)
@ -541,7 +550,7 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
movq %bits2, 80(%rax) movq %bits2, 80(%rax)
movq %bits3, 88(%rax) movq %bits3, 88(%rax)
# Restore registers /* Restore registers */
pop %r15 pop %r15
pop %r14 pop %r14
pop %r13 pop %r13
@ -558,4 +567,5 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
pop %rbx pop %rbx
pop %rax pop %rax
ret ret
#endif #endif