diff --git a/lib/decompress/huf_decompress_amd64.S b/lib/decompress/huf_decompress_amd64.S index 77a2d856..83e3d756 100644 --- a/lib/decompress/huf_decompress_amd64.S +++ b/lib/decompress/huf_decompress_amd64.S @@ -1,57 +1,61 @@ -# Calling convention: -# -# %rdi contains the first argument: HUF_DecompressAsmArgs*. -# %rbp is'nt maintained (no frame pointer). -# %rsp contains the stack pointer that grows down. -# No red-zone is assumed, only addresses >= %rsp are used. -# All register contents are preserved. -# -# TODO: Support Windows calling convention. - #if !defined(HUF_DISABLE_ASM) && defined(__x86_64__) + +/* Calling convention: + * + * %rdi contains the first argument: HUF_DecompressAsmArgs*. + * %rbp isn't maintained (no frame pointer). + * %rsp contains the stack pointer that grows down. + * No red-zone is assumed, only addresses >= %rsp are used. + * All register contents are preserved. + * + * TODO: Support Windows calling convention. + */ + .global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop .global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop .global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop .global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop .text -# Sets up register mappings for clarity. -# op[], bits[], dtable & ip[0] each get their own register. -# ip[1,2,3] & olimit alias var[]. -# %rax is a scratch register. +/* Sets up register mappings for clarity. + * op[], bits[], dtable & ip[0] each get their own register. + * ip[1,2,3] & olimit alias var[]. + * %rax is a scratch register. 
+ */ -#define op0 rsi -#define op1 rbx -#define op2 rcx -#define op3 rdi +#define op0 rsi +#define op1 rbx +#define op2 rcx +#define op3 rdi -#define ip0 r8 -#define ip1 r9 -#define ip2 r10 -#define ip3 r11 +#define ip0 r8 +#define ip1 r9 +#define ip2 r10 +#define ip3 r11 -#define bits0 rbp -#define bits1 rdx -#define bits2 r12 -#define bits3 r13 -#define dtable r14 -#define olimit r15 +#define bits0 rbp +#define bits1 rdx +#define bits2 r12 +#define bits3 r13 +#define dtable r14 +#define olimit r15 -# var[] aliases ip[1,2,3] & olimit -# ip[1,2,3] are saved every iteration. -# olimit is only used in compute_olimit. -#define var0 r15 -#define var1 r9 -#define var2 r10 -#define var3 r11 +/* var[] aliases ip[1,2,3] & olimit + * ip[1,2,3] are saved every iteration. + * olimit is only used in compute_olimit. + */ +#define var0 r15 +#define var1 r9 +#define var2 r10 +#define var3 r11 -# 32-bit var registers -#define vard0 r15d -#define vard1 r9d -#define vard2 r10d -#define vard3 r11d +/* 32-bit var registers */ +#define vard0 r15d +#define vard1 r9d +#define vard2 r10d +#define vard3 r11d -# Helper macro: args if idx != 4. +/* Helper macro: args if idx != 4. */ #define IF_NOT_4_0(...) __VA_ARGS__ #define IF_NOT_4_1(...) __VA_ARGS__ #define IF_NOT_4_2(...) __VA_ARGS__ @@ -60,502 +64,508 @@ #define IF_NOT_4_(idx, ...) IF_NOT_4_##idx(__VA_ARGS__) #define IF_NOT_4(idx, ...) IF_NOT_4_(idx, __VA_ARGS__) -# Calls X(N) for each stream 0, 1, 2, 3. +/* Calls X(N) for each stream 0, 1, 2, 3. */ #define FOR_EACH_STREAM(X) \ - X(0); \ - X(1); \ - X(2); \ - X(3) + X(0); \ + X(1); \ + X(2); \ + X(3) -# Calls X(N, idx) for each stream 0, 1, 2, 3. +/* Calls X(N, idx) for each stream 0, 1, 2, 3. */ #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \ - X(0, idx); \ - X(1, idx); \ - X(2, idx); \ - X(3, idx) + X(0, idx); \ + X(1, idx); \ + X(2, idx); \ + X(3, idx) -# Define both _HUF_* & HUF_* symbols because MacOS -# C symbols are prefixed with '_' & Linux symbols aren't. 
+/* Define both _HUF_* & HUF_* symbols because MacOS + * C symbols are prefixed with '_' & Linux symbols aren't. + */ _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: - # Save all registers - even if they are callee saved for simplicity. - push %rax - push %rbx - push %rcx - push %rdx - push %rbp - push %rsi - push %rdi - push %r8 - push %r9 - push %r10 - push %r11 - push %r12 - push %r13 - push %r14 - push %r15 + /* Save all registers - even if they are callee saved for simplicity. */ + push %rax + push %rbx + push %rcx + push %rdx + push %rbp + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + push %r12 + push %r13 + push %r14 + push %r15 - # Read HUF_DecompressAsmArgs* args from %rax - movq %rdi, %rax - movq 0(%rax), %ip0 - movq 8(%rax), %ip1 - movq 16(%rax), %ip2 - movq 24(%rax), %ip3 - movq 32(%rax), %op0 - movq 40(%rax), %op1 - movq 48(%rax), %op2 - movq 56(%rax), %op3 - movq 64(%rax), %bits0 - movq 72(%rax), %bits1 - movq 80(%rax), %bits2 - movq 88(%rax), %bits3 - movq 96(%rax), %dtable - push %rax # argument - push 104(%rax) # ilimit - push 112(%rax) # oend - push %olimit # olimit space + /* Read HUF_DecompressAsmArgs* args from %rax */ + movq %rdi, %rax + movq 0(%rax), %ip0 + movq 8(%rax), %ip1 + movq 16(%rax), %ip2 + movq 24(%rax), %ip3 + movq 32(%rax), %op0 + movq 40(%rax), %op1 + movq 48(%rax), %op2 + movq 56(%rax), %op3 + movq 64(%rax), %bits0 + movq 72(%rax), %bits1 + movq 80(%rax), %bits2 + movq 88(%rax), %bits3 + movq 96(%rax), %dtable + push %rax /* argument */ + push 104(%rax) /* ilimit */ + push 112(%rax) /* oend */ + push %olimit /* olimit space */ - subq $24, %rsp + subq $24, %rsp .L_4X1_compute_olimit: - # Computes how many iterations we can do savely - # %r15, %rax may be clobbered - # rbx, rdx must be saved - # op3 & ip0 mustn't be clobbered - movq %rbx, 0(%rsp) - movq %rdx, 8(%rsp) + /* Computes how many iterations we can do safely + * %r15, %rax may be 
clobbered + * rbx, rdx must be saved + * op3 & ip0 mustn't be clobbered + */ + movq %rbx, 0(%rsp) + movq %rdx, 8(%rsp) - movq 32(%rsp), %rax # rax = oend - subq %op3, %rax # rax = oend - op3 + movq 32(%rsp), %rax /* rax = oend */ + subq %op3, %rax /* rax = oend - op3 */ - # r15 = (oend - op3) / 5 - movabsq $-3689348814741910323, %rdx - mulq %rdx - movq %rdx, %r15 - shrq $2, %r15 + /* r15 = (oend - op3) / 5 */ + movabsq $-3689348814741910323, %rdx + mulq %rdx + movq %rdx, %r15 + shrq $2, %r15 - movq %ip0, %rax # rax = ip0 - movq 40(%rsp), %rdx # rdx = ilimit - subq %rdx, %rax # rax = ip0 - ilimit - movq %rax, %rbx # rbx = ip0 - ilimit + movq %ip0, %rax /* rax = ip0 */ + movq 40(%rsp), %rdx /* rdx = ilimit */ + subq %rdx, %rax /* rax = ip0 - ilimit */ + movq %rax, %rbx /* rbx = ip0 - ilimit */ - # rdx = (ip0 - ilimit) / 7 - movabsq $2635249153387078803, %rdx - mulq %rdx - subq %rdx, %rbx - shrq %rbx - addq %rbx, %rdx - shrq $2, %rdx + /* rdx = (ip0 - ilimit) / 7 */ + movabsq $2635249153387078803, %rdx + mulq %rdx + subq %rdx, %rbx + shrq %rbx + addq %rbx, %rdx + shrq $2, %rdx - # r15 = min(%rdx, %r15) - cmpq %rdx, %r15 - cmova %rdx, %r15 + /* r15 = min(%rdx, %r15) */ + cmpq %rdx, %r15 + cmova %rdx, %r15 - # r15 = r15 * 5 - leaq (%r15, %r15, 4), %r15 + /* r15 = r15 * 5 */ + leaq (%r15, %r15, 4), %r15 - # olimit = op3 + r15 - addq %op3, %olimit + /* olimit = op3 + r15 */ + addq %op3, %olimit - movq 8(%rsp), %rdx - movq 0(%rsp), %rbx + movq 8(%rsp), %rdx + movq 0(%rsp), %rbx - # If (op3 + 20 > olimit) - movq %op3, %rax # rax = op3 - addq $20, %rax # rax = op3 + 20 - cmpq %rax, %olimit # op3 + 20 > olimit - jb .L_4X1_exit + /* If (op3 + 20 > olimit) */ + movq %op3, %rax /* rax = op3 */ + addq $20, %rax /* rax = op3 + 20 */ + cmpq %rax, %olimit /* op3 + 20 > olimit */ + jb .L_4X1_exit - # If (ip1 < ip0) go to exit - cmpq %ip0, %ip1 - jb .L_4X1_exit + /* If (ip1 < ip0) go to exit */ + cmpq %ip0, %ip1 + jb .L_4X1_exit - # If (ip2 < ip1) go to exit - cmpq %ip1, %ip2 - jb 
.L_4X1_exit + /* If (ip2 < ip1) go to exit */ + cmpq %ip1, %ip2 + jb .L_4X1_exit - # If (ip3 < ip2) go to exit - cmpq %ip2, %ip3 - jb .L_4X1_exit + /* If (ip3 < ip2) go to exit */ + cmpq %ip2, %ip3 + jb .L_4X1_exit -# Reads top 11 bits from bits[n] -# Loads dt[bits[n]] into var[n] -#define GET_NEXT_DELT(n) \ - movq $53, %var##n; \ - shrxq %var##n, %bits##n, %var##n; \ - movzwl (%dtable,%var##n,2),%vard##n +/* Reads top 11 bits from bits[n] + * Loads dt[bits[n]] into var[n] + */ +#define GET_NEXT_DELT(n) \ + movq $53, %var##n; \ + shrxq %var##n, %bits##n, %var##n; \ + movzwl (%dtable,%var##n,2),%vard##n -# var[n] must contain the DTable entry computed with GET_NEXT_DELT -# Moves var[n] to %rax -# bits[n] <<= var[n] & 63 -# op[n][idx] = %rax >> 8 -# %ah is a way to access bits [8, 16) of %rax -#define DECODE_FROM_DELT(n, idx) \ - movq %var##n, %rax; \ - shlxq %var##n, %bits##n, %bits##n; \ - movb %ah, idx(%op##n) +/* var[n] must contain the DTable entry computed with GET_NEXT_DELT + * Moves var[n] to %rax + * bits[n] <<= var[n] & 63 + * op[n][idx] = %rax >> 8 + * %ah is a way to access bits [8, 16) of %rax + */ +#define DECODE_FROM_DELT(n, idx) \ + movq %var##n, %rax; \ + shlxq %var##n, %bits##n, %bits##n; \ + movb %ah, idx(%op##n) -# Assumes GET_NEXT_DELT has been called. -# Calls DECODE_FROM_DELT then GET_NEXT_DELT if n < 4 -#define DECODE(n, idx) \ - DECODE_FROM_DELT(n, idx); \ - IF_NOT_4(idx, GET_NEXT_DELT(n)) +/* Assumes GET_NEXT_DELT has been called. 
+ * Calls DECODE_FROM_DELT then GET_NEXT_DELT if idx != 4 + */ +#define DECODE(n, idx) \ + DECODE_FROM_DELT(n, idx); \ + IF_NOT_4(idx, GET_NEXT_DELT(n)) -# // ctz & nbBytes is stored in bits[n] -# // nbBits is stored in %rax -# ctz = CTZ[bits[n]] -# nbBits = ctz & 7 -# nbBytes = ctz >> 3 -# op[n] += 5 -# ip[n] -= nbBytes -# // Note: x86-64 is little-endian ==> no bswap -# bits[n] = MEM_readST(ip[n]) | 1 -# bits[n] <<= nbBits -#define RELOAD_BITS(n) \ - bsfq %bits##n, %bits##n; \ - movq %bits##n, %rax; \ - andq $7, %rax; \ - shrq $3, %bits##n; \ - leaq 5(%op##n), %op##n; \ - subq %bits##n, %ip##n; \ - movq (%ip##n), %bits##n; \ - orq $1, %bits##n; \ - shlx %rax, %bits##n, %bits##n; +/* // ctz & nbBytes is stored in bits[n] + * // nbBits is stored in %rax + * ctz = CTZ[bits[n]] + * nbBits = ctz & 7 + * nbBytes = ctz >> 3 + * op[n] += 5 + * ip[n] -= nbBytes + * // Note: x86-64 is little-endian ==> no bswap + * bits[n] = MEM_readST(ip[n]) | 1 + * bits[n] <<= nbBits + */ +#define RELOAD_BITS(n) \ + bsfq %bits##n, %bits##n; \ + movq %bits##n, %rax; \ + andq $7, %rax; \ + shrq $3, %bits##n; \ + leaq 5(%op##n), %op##n; \ + subq %bits##n, %ip##n; \ + movq (%ip##n), %bits##n; \ + orq $1, %bits##n; \ + shlx %rax, %bits##n, %bits##n - # Store clobbered variables on the stack - movq %olimit, 24(%rsp) - movq %ip1, 0(%rsp) - movq %ip2, 8(%rsp) - movq %ip3, 16(%rsp) + /* Store clobbered variables on the stack */ + movq %olimit, 24(%rsp) + movq %ip1, 0(%rsp) + movq %ip2, 8(%rsp) + movq %ip3, 16(%rsp) - # Call GET_NEXT_DELT for each stream - FOR_EACH_STREAM(GET_NEXT_DELT) + /* Call GET_NEXT_DELT for each stream */ + FOR_EACH_STREAM(GET_NEXT_DELT) - .p2align 6 + .p2align 6 .L_4X1_loop_body: -# LLVM-MCA-BEGIN decode-4X1 - # Decode 5 symbols in each of the 4 streams (20 total) - # Must have called GET_NEXT_DELT for each stream - FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) - FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) - FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) - FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) - 
FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) + /* Decode 5 symbols in each of the 4 streams (20 total) + * Must have called GET_NEXT_DELT for each stream + */ + FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) - # Load ip[1,2,3] from stack (var[] aliases them) - # ip[] is needed for RELOAD_BITS - # Each will be stored back to the stack after RELOAD - movq 0(%rsp), %ip1 - movq 8(%rsp), %ip2 - movq 16(%rsp), %ip3 + /* Load ip[1,2,3] from stack (var[] aliases them) + * ip[] is needed for RELOAD_BITS + * Each will be stored back to the stack after RELOAD + */ + movq 0(%rsp), %ip1 + movq 8(%rsp), %ip2 + movq 16(%rsp), %ip3 - # Reload each stream & fetch the next table entry - # to prepare for the next iteration - RELOAD_BITS(0) - GET_NEXT_DELT(0) + /* Reload each stream & fetch the next table entry + * to prepare for the next iteration + */ + RELOAD_BITS(0) + GET_NEXT_DELT(0) - RELOAD_BITS(1) - movq %ip1, 0(%rsp) - GET_NEXT_DELT(1) + RELOAD_BITS(1) + movq %ip1, 0(%rsp) + GET_NEXT_DELT(1) - RELOAD_BITS(2) - movq %ip2, 8(%rsp) - GET_NEXT_DELT(2) + RELOAD_BITS(2) + movq %ip2, 8(%rsp) + GET_NEXT_DELT(2) - RELOAD_BITS(3) - movq %ip3, 16(%rsp) - GET_NEXT_DELT(3) + RELOAD_BITS(3) + movq %ip3, 16(%rsp) + GET_NEXT_DELT(3) - # If op3 < olimit: continue the loop - cmp %op3, 24(%rsp) - ja .L_4X1_loop_body + /* If op3 < olimit: continue the loop */ + cmp %op3, 24(%rsp) + ja .L_4X1_loop_body - # Reload ip[1,2,3] from stack - movq 0(%rsp), %ip1 - movq 8(%rsp), %ip2 - movq 16(%rsp), %ip3 + /* Reload ip[1,2,3] from stack */ + movq 0(%rsp), %ip1 + movq 8(%rsp), %ip2 + movq 16(%rsp), %ip3 - # Re-compute olimit - jmp .L_4X1_compute_olimit + /* Re-compute olimit */ + jmp .L_4X1_compute_olimit #undef GET_NEXT_DELT #undef DECODE_FROM_DELT #undef DECODE #undef RELOAD_BITS -# LLVM-MCA-END .L_4X1_exit: - addq $24, %rsp + addq $24, %rsp - # Restore stack 
(oend & olimit) - pop %rax # olimit - pop %rax # oend - pop %rax # ilimit - pop %rax # arg + /* Restore stack (oend & olimit) */ + pop %rax /* olimit */ + pop %rax /* oend */ + pop %rax /* ilimit */ + pop %rax /* arg */ - # Save ip / op / bits - movq %ip0, 0(%rax) - movq %ip1, 8(%rax) - movq %ip2, 16(%rax) - movq %ip3, 24(%rax) - movq %op0, 32(%rax) - movq %op1, 40(%rax) - movq %op2, 48(%rax) - movq %op3, 56(%rax) - movq %bits0, 64(%rax) - movq %bits1, 72(%rax) - movq %bits2, 80(%rax) - movq %bits3, 88(%rax) + /* Save ip / op / bits */ + movq %ip0, 0(%rax) + movq %ip1, 8(%rax) + movq %ip2, 16(%rax) + movq %ip3, 24(%rax) + movq %op0, 32(%rax) + movq %op1, 40(%rax) + movq %op2, 48(%rax) + movq %op3, 56(%rax) + movq %bits0, 64(%rax) + movq %bits1, 72(%rax) + movq %bits2, 80(%rax) + movq %bits3, 88(%rax) - # Restore registers - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %r11 - pop %r10 - pop %r9 - pop %r8 - pop %rdi - pop %rsi - pop %rbp - pop %rdx - pop %rcx - pop %rbx - pop %rax - ret + /* Restore registers */ + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rbp + pop %rdx + pop %rcx + pop %rbx + pop %rax + ret _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: - # Save all registers - even if they are callee saved for simplicity. - push %rax - push %rbx - push %rcx - push %rdx - push %rbp - push %rsi - push %rdi - push %r8 - push %r9 - push %r10 - push %r11 - push %r12 - push %r13 - push %r14 - push %r15 + /* Save all registers - even if they are callee saved for simplicity. 
*/ + push %rax + push %rbx + push %rcx + push %rdx + push %rbp + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + push %r12 + push %r13 + push %r14 + push %r15 - movq %rdi, %rax - movq 0(%rax), %ip0 - movq 8(%rax), %ip1 - movq 16(%rax), %ip2 - movq 24(%rax), %ip3 - movq 32(%rax), %op0 - movq 40(%rax), %op1 - movq 48(%rax), %op2 - movq 56(%rax), %op3 - movq 64(%rax), %bits0 - movq 72(%rax), %bits1 - movq 80(%rax), %bits2 - movq 88(%rax), %bits3 - movq 96(%rax), %dtable - push %rax # argument - push %rax # olimit - push 104(%rax) # ilimit + movq %rdi, %rax + movq 0(%rax), %ip0 + movq 8(%rax), %ip1 + movq 16(%rax), %ip2 + movq 24(%rax), %ip3 + movq 32(%rax), %op0 + movq 40(%rax), %op1 + movq 48(%rax), %op2 + movq 56(%rax), %op3 + movq 64(%rax), %bits0 + movq 72(%rax), %bits1 + movq 80(%rax), %bits2 + movq 88(%rax), %bits3 + movq 96(%rax), %dtable + push %rax /* argument */ + push %rax /* olimit */ + push 104(%rax) /* ilimit */ - movq 112(%rax), %rax - push %rax # oend3 + movq 112(%rax), %rax + push %rax /* oend3 */ - movq %op3, %rax - push %rax # oend2 + movq %op3, %rax + push %rax /* oend2 */ - movq %op2, %rax - push %rax # oend1 + movq %op2, %rax + push %rax /* oend1 */ - movq %op1, %rax - push %rax # oend0 + movq %op1, %rax + push %rax /* oend0 */ - # Scratch space - subq $8, %rsp + /* Scratch space */ + subq $8, %rsp .L_4X2_compute_olimit: - # Computes how many iterations we can do savely - # %r15, %rax may be clobbered - # rdx must be saved - # op[1,2,3,4] & ip0 mustn't be clobbered - movq %rdx, 0(%rsp) + /* Computes how many iterations we can do safely + * %r15, %rax may be clobbered + * rdx must be saved + * op[0,1,2,3] & ip0 mustn't be clobbered + */ + movq %rdx, 0(%rsp) - # We can consume up to 7 input bytes each iteration. - movq %ip0, %rax # rax = ip0 - movq 40(%rsp), %rdx # rdx = ilimit - subq %rdx, %rax # rax = ip0 - ilimit - movq %rax, %r15 # r15 = ip0 - ilimit + /* We can consume up to 7 input bytes each iteration. 
*/ + movq %ip0, %rax /* rax = ip0 */ + movq 40(%rsp), %rdx /* rdx = ilimit */ + subq %rdx, %rax /* rax = ip0 - ilimit */ + movq %rax, %r15 /* r15 = ip0 - ilimit */ - # rdx = rax / 7 - movabsq $2635249153387078803, %rdx - mulq %rdx - subq %rdx, %r15 - shrq %r15 - addq %r15, %rdx - shrq $2, %rdx + /* rdx = rax / 7 */ + movabsq $2635249153387078803, %rdx + mulq %rdx + subq %rdx, %r15 + shrq %r15 + addq %r15, %rdx + shrq $2, %rdx - # r15 = (ip0 - ilimit) / 7 - movq %rdx, %r15 + /* r15 = (ip0 - ilimit) / 7 */ + movq %rdx, %r15 - movabsq $-3689348814741910323, %rdx - movq 8(%rsp), %rax # rax = oend0 - subq %op0, %rax # rax = oend0 - op0 - mulq %rdx - shrq $3, %rdx # rdx = rax / 10 + movabsq $-3689348814741910323, %rdx + movq 8(%rsp), %rax /* rax = oend0 */ + subq %op0, %rax /* rax = oend0 - op0 */ + mulq %rdx + shrq $3, %rdx /* rdx = rax / 10 */ - # r15 = min(%rdx, %r15) - cmpq %rdx, %r15 - cmova %rdx, %r15 + /* r15 = min(%rdx, %r15) */ + cmpq %rdx, %r15 + cmova %rdx, %r15 - movabsq $-3689348814741910323, %rdx - movq 16(%rsp), %rax # rax = oend1 - subq %op1, %rax # rax = oend1 - op1 - mulq %rdx - shrq $3, %rdx # rdx = rax / 10 + movabsq $-3689348814741910323, %rdx + movq 16(%rsp), %rax /* rax = oend1 */ + subq %op1, %rax /* rax = oend1 - op1 */ + mulq %rdx + shrq $3, %rdx /* rdx = rax / 10 */ - # r15 = min(%rdx, %r15) - cmpq %rdx, %r15 - cmova %rdx, %r15 + /* r15 = min(%rdx, %r15) */ + cmpq %rdx, %r15 + cmova %rdx, %r15 - movabsq $-3689348814741910323, %rdx - movq 24(%rsp), %rax # rax = oend2 - subq %op2, %rax # rax = oend2 - op2 - mulq %rdx - shrq $3, %rdx # rdx = rax / 10 + movabsq $-3689348814741910323, %rdx + movq 24(%rsp), %rax /* rax = oend2 */ + subq %op2, %rax /* rax = oend2 - op2 */ + mulq %rdx + shrq $3, %rdx /* rdx = rax / 10 */ - # r15 = min(%rdx, %r15) - cmpq %rdx, %r15 - cmova %rdx, %r15 + /* r15 = min(%rdx, %r15) */ + cmpq %rdx, %r15 + cmova %rdx, %r15 - movabsq $-3689348814741910323, %rdx - movq 32(%rsp), %rax # rax = oend3 - subq %op3, %rax # rax = oend3 
- op3 - mulq %rdx - shrq $3, %rdx # rdx = rax / 10 + movabsq $-3689348814741910323, %rdx + movq 32(%rsp), %rax /* rax = oend3 */ + subq %op3, %rax /* rax = oend3 - op3 */ + mulq %rdx + shrq $3, %rdx /* rdx = rax / 10 */ - # r15 = min(%rdx, %r15) - cmpq %rdx, %r15 - cmova %rdx, %r15 + /* r15 = min(%rdx, %r15) */ + cmpq %rdx, %r15 + cmova %rdx, %r15 - # olimit = op3 + 5 * r15 - movq %r15, %rax - leaq (%op3, %rax, 4), %olimit - addq %rax, %olimit + /* olimit = op3 + 5 * r15 */ + movq %r15, %rax + leaq (%op3, %rax, 4), %olimit + addq %rax, %olimit - movq 0(%rsp), %rdx + movq 0(%rsp), %rdx - # If (op3 + 10 > olimit) - movq %op3, %rax # rax = op3 - addq $10, %rax # rax = op3 + 10 - cmpq %rax, %olimit # op3 + 10 > olimit - jb .L_4X2_exit + /* If (op3 + 10 > olimit) */ + movq %op3, %rax /* rax = op3 */ + addq $10, %rax /* rax = op3 + 10 */ + cmpq %rax, %olimit /* op3 + 10 > olimit */ + jb .L_4X2_exit - # If (ip1 < ip0) go to exit - cmpq %ip0, %ip1 - jb .L_4X2_exit + /* If (ip1 < ip0) go to exit */ + cmpq %ip0, %ip1 + jb .L_4X2_exit - # If (ip2 < ip1) go to exit - cmpq %ip1, %ip2 - jb .L_4X2_exit + /* If (ip2 < ip1) go to exit */ + cmpq %ip1, %ip2 + jb .L_4X2_exit - # If (ip3 < ip2) go to exit - cmpq %ip2, %ip3 - jb .L_4X2_exit + /* If (ip3 < ip2) go to exit */ + cmpq %ip2, %ip3 + jb .L_4X2_exit -#define DECODE(n, idx) \ - movq %bits##n, %rax; \ - shrq $53, %rax; \ - movzwl 0(%dtable,%rax,4),%r8d; \ - movzbl 2(%dtable,%rax,4),%r15d; \ - movzbl 3(%dtable,%rax,4),%eax; \ - movw %r8w, (%op##n); \ - shlxq %r15, %bits##n, %bits##n; \ - addq %rax, %op##n +#define DECODE(n, idx) \ + movq %bits##n, %rax; \ + shrq $53, %rax; \ + movzwl 0(%dtable,%rax,4),%r8d; \ + movzbl 2(%dtable,%rax,4),%r15d; \ + movzbl 3(%dtable,%rax,4),%eax; \ + movw %r8w, (%op##n); \ + shlxq %r15, %bits##n, %bits##n; \ + addq %rax, %op##n -#define RELOAD_BITS(n) \ - bsfq %bits##n, %bits##n; \ - movq %bits##n, %rax; \ - shrq $3, %bits##n; \ - andq $7, %rax; \ - subq %bits##n, %ip##n; \ - movq (%ip##n), %bits##n; 
\ - orq $1, %bits##n; \ - shlxq %rax, %bits##n, %bits##n; +#define RELOAD_BITS(n) \ + bsfq %bits##n, %bits##n; \ + movq %bits##n, %rax; \ + shrq $3, %bits##n; \ + andq $7, %rax; \ + subq %bits##n, %ip##n; \ + movq (%ip##n), %bits##n; \ + orq $1, %bits##n; \ + shlxq %rax, %bits##n, %bits##n - movq %olimit, 48(%rsp) + movq %olimit, 48(%rsp) - .p2align 6 + .p2align 6 .L_4X2_loop_body: -# LLVM-MCA-BEGIN decode-4X2 + /* We clobber r8, so store it on the stack */ + movq %r8, 0(%rsp) - # We clobber r8, so store it on the stack - movq %r8, 0(%rsp) + /* Decode 5 symbols from each of the 4 streams (20 symbols total). */ + FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) - # Decode 5 symbols from each of the 4 streams (20 symbols total). - FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) - FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) - FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) - FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) - FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) + /* Reload r8 */ + movq 0(%rsp), %r8 - # Reload r8 - movq 0(%rsp), %r8 + FOR_EACH_STREAM(RELOAD_BITS) - FOR_EACH_STREAM(RELOAD_BITS) - - cmp %op3, 48(%rsp) - ja .L_4X2_loop_body - jmp .L_4X2_compute_olimit + cmp %op3, 48(%rsp) + ja .L_4X2_loop_body + jmp .L_4X2_compute_olimit #undef DECODE #undef RELOAD_BITS -# LLVM-MCA-END .L_4X2_exit: - addq $8, %rsp - # Restore stack (oend & olimit) - pop %rax # oend0 - pop %rax # oend1 - pop %rax # oend2 - pop %rax # oend3 - pop %rax # ilimit - pop %rax # olimit - pop %rax # arg + addq $8, %rsp + /* Restore stack (oend & olimit) */ + pop %rax /* oend0 */ + pop %rax /* oend1 */ + pop %rax /* oend2 */ + pop %rax /* oend3 */ + pop %rax /* ilimit */ + pop %rax /* olimit */ + pop %rax /* arg */ - # Save ip / op / bits - movq %ip0, 0(%rax) - movq %ip1, 8(%rax) - movq %ip2, 16(%rax) - movq %ip3, 24(%rax) - movq %op0, 32(%rax) - movq %op1, 40(%rax) - movq %op2, 48(%rax) 
- movq %op3, 56(%rax) - movq %bits0, 64(%rax) - movq %bits1, 72(%rax) - movq %bits2, 80(%rax) - movq %bits3, 88(%rax) + /* Save ip / op / bits */ + movq %ip0, 0(%rax) + movq %ip1, 8(%rax) + movq %ip2, 16(%rax) + movq %ip3, 24(%rax) + movq %op0, 32(%rax) + movq %op1, 40(%rax) + movq %op2, 48(%rax) + movq %op3, 56(%rax) + movq %bits0, 64(%rax) + movq %bits1, 72(%rax) + movq %bits2, 80(%rax) + movq %bits3, 88(%rax) + + /* Restore registers */ + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rbp + pop %rdx + pop %rcx + pop %rbx + pop %rax + ret - # Restore registers - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %r11 - pop %r10 - pop %r9 - pop %r8 - pop %rdi - pop %rsi - pop %rbp - pop %rdx - pop %rcx - pop %rbx - pop %rax - ret #endif