Fix performance regression on aarch64 with clang

This commit is contained in:
Nick Terrell 2020-01-23 16:18:52 -08:00
parent 2f31050a3f
commit cb2abc3dbe
2 changed files with 7 additions and 6 deletions

View File

@ -817,7 +817,7 @@ HUF_decompress4X2_usingDTable_internal_body(
/* 16-32 symbols per loop (4-8 symbols per stream) */
for ( ; (endSignal) & (op4 < olimit); ) {
#ifdef __clang__
#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
@ -855,10 +855,11 @@ HUF_decompress4X2_usingDTable_internal_body(
HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
endSignal = LIKELY(
(BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished)
& (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished)
& (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished)
& (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished));
#endif
}

View File

@ -580,7 +580,7 @@ typedef struct {
* Precondition: *ip <= *op
* Postcondition: *op - *op >= 8
*/
static void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
assert(*ip <= *op);
if (offset < 8) {
/* close range match, overlap */