Widen ZSTD_wildcopy to 32 bytes

This commit is contained in:
Nick Terrell 2019-09-20 00:52:15 -07:00
parent efd37a64ea
commit cdad7fa512
2 changed files with 18 additions and 16 deletions

View File

@@ -197,7 +197,7 @@ static void ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
static void ZSTD_copy16(void* dst, const void* src) { memcpy(dst, src, 16); } static void ZSTD_copy16(void* dst, const void* src) { memcpy(dst, src, 16); }
#define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; } #define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
#define WILDCOPY_OVERLENGTH 16 #define WILDCOPY_OVERLENGTH 32
#define WILDCOPY_VECLEN 16 #define WILDCOPY_VECLEN 16
typedef enum { typedef enum {
@@ -237,11 +237,11 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
* On clang-8 unrolling once is +1.4%, twice is +3.3%, thrice is +3%. * On clang-8 unrolling once is +1.4%, twice is +3.3%, thrice is +3%.
*/ */
COPY16(op, ip); COPY16(op, ip);
if (op >= oend) return;
COPY16(op, ip); COPY16(op, ip);
if (op >= oend) return; if (op >= oend) return;
do { do {
COPY16(op, ip); COPY16(op, ip);
COPY16(op, ip);
} }
while (op < oend); while (op < oend);
} }
@@ -257,7 +257,7 @@ MEM_STATIC void ZSTD_wildcopy8(void* dst, const void* src, ptrdiff_t length)
BYTE* op = (BYTE*)dst; BYTE* op = (BYTE*)dst;
BYTE* const oend = (BYTE*)op + length; BYTE* const oend = (BYTE*)op + length;
do { do {
COPY8(op, ip) COPY8(op, ip);
} while (op < oend); } while (op < oend);
} }

View File

@@ -724,12 +724,14 @@ size_t ZSTD_execSequence(BYTE* op,
assert(oMatchEnd <= oend_w /* Can wildcopy matches */); assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
/* Copy Literals: /* Copy Literals:
* Split out litLength <= 16 since it is nearly always true. +1% on gcc-9. * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
* We likely don't need the full 32-byte wildcopy.
*/ */
if (sequence.litLength <= 16) assert(WILDCOPY_OVERLENGTH >= 16);
ZSTD_copy16(op, *litPtr); ZSTD_copy16(op, (*litPtr));
else if (sequence.litLength > 16) {
ZSTD_wildcopy(op, (*litPtr), sequence.litLength, ZSTD_no_overlap); ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap);
}
op = oLitEnd; op = oLitEnd;
*litPtr = iLitEnd; /* update for next sequence */ *litPtr = iLitEnd; /* update for next sequence */
@@ -755,18 +757,18 @@ size_t ZSTD_execSequence(BYTE* op,
assert(match >= prefixStart); assert(match >= prefixStart);
assert(sequence.matchLength >= 1); assert(sequence.matchLength >= 1);
/* Nearly all offsets are >= 16 bytes, which means we can use wildcopy /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
* without overlap checking. * without overlap checking.
*/ */
if (sequence.offset >= 16) { if (sequence.offset >= WILDCOPY_VECLEN) {
/* Split out matchLength <= 16 since it is nearly always true. +1% on gcc-9. */ /* Split out matchLength <= 32 since it is nearly always true. +1% on gcc-9.
if (sequence.matchLength <= 16) * We copy 32 bytes here since matches are generally longer than literals.
ZSTD_copy16(op, match); * In silesia, for example ~10% of matches are longer than 16 bytes.
else */
ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
return sequenceLength; return sequenceLength;
} }
assert(sequence.offset < 16); assert(sequence.offset < WILDCOPY_VECLEN);
/* Copy 8 bytes and spread the offset to be >= 8. */ /* Copy 8 bytes and spread the offset to be >= 8. */
ZSTD_overlapCopy8(&op, &match, sequence.offset); ZSTD_overlapCopy8(&op, &match, sequence.offset);