Widen ZSTD_wildcopy to 32 bytes
parent efd37a64ea
commit cdad7fa512
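This commit raises WILDCOPY_OVERLENGTH from 16 to 32 and has ZSTD_wildcopy move 32 bytes per step: two COPY16 calls before the first bounds check and two per loop iteration, with ZSTD_execSequence's literal and match copies adjusted to match. As a rough sketch of the contract this implies (illustrative code only, not the zstd implementation; sketch_wildcopy32 and SKETCH_OVERLENGTH are made-up names), the destination must keep at least 32 spare bytes past the requested end, because the copy always lands in whole 32-byte strides:

    /* sketch_wildcopy32: copy `length` bytes in 32-byte strides, deliberately
     * overshooting; dst must therefore provide `length` + 32 writable bytes,
     * and src must stay readable over the same span. */
    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    #define SKETCH_OVERLENGTH 32   /* stand-in for the new WILDCOPY_OVERLENGTH */

    static void sketch_wildcopy32(void* dst, const void* src, ptrdiff_t length)
    {
        const char* ip = (const char*)src;
        char* op = (char*)dst;
        char* const oend = op + length;
        do {
            memcpy(op, ip, 32);    /* always a full 32-byte stride */
            op += 32; ip += 32;
        } while (op < oend);       /* may write past oend, inside the slack */
    }

    int main(void)
    {
        const char src[64] = "hello wildcopy";
        char dst[5 + SKETCH_OVERLENGTH];    /* requested bytes + mandatory slack */
        sketch_wildcopy32(dst, src, 5);
        printf("%.5s\n", dst);              /* prints "hello" */
        return 0;
    }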
@@ -197,7 +197,7 @@ static void ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
 static void ZSTD_copy16(void* dst, const void* src) { memcpy(dst, src, 16); }
 #define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
 
-#define WILDCOPY_OVERLENGTH 16
+#define WILDCOPY_OVERLENGTH 32
 #define WILDCOPY_VECLEN 16
 
 typedef enum {
@@ -237,11 +237,11 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
          * On clang-8 unrolling once is +1.4%, twice is +3.3%, thrice is +3%.
          */
         COPY16(op, ip);
-        if (op >= oend) return;
         COPY16(op, ip);
         if (op >= oend) return;
         do {
             COPY16(op, ip);
+            COPY16(op, ip);
         }
         while (op < oend);
     }
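Both leading COPY16 calls now run before the first `op >= oend` check, so every call that takes this non-overlap path writes at least 32 bytes and then advances in 32-byte strides; a zero-length call writes exactly 32 bytes, which is what the widened WILDCOPY_OVERLENGTH guarantees room for. A small self-checking sketch of the overshoot bound (illustrative only, not zstd code):

    #include <assert.h>
    #include <stddef.h>

    int main(void)
    {
        for (ptrdiff_t length = 1; length <= 4096; length++) {
            ptrdiff_t written = 32;                   /* two unconditional COPY16 calls */
            while (written < length) written += 32;   /* 32-byte strides of the do/while */
            assert(written - length < 32);            /* overshoot fits in WILDCOPY_OVERLENGTH */
        }
        return 0;
    }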
@@ -257,7 +257,7 @@ MEM_STATIC void ZSTD_wildcopy8(void* dst, const void* src, ptrdiff_t length)
     BYTE* op = (BYTE*)dst;
     BYTE* const oend = (BYTE*)op + length;
     do {
-        COPY8(op, ip)
+        COPY8(op, ip);
     } while (op < oend);
 }
 
@@ -724,12 +724,14 @@ size_t ZSTD_execSequence(BYTE* op,
     assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
 
     /* Copy Literals:
-     * Split out litLength <= 16 since it is nearly always true. +1% on gcc-9.
+     * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
+     * We likely don't need the full 32-byte wildcopy.
      */
-    if (sequence.litLength <= 16)
-        ZSTD_copy16(op, *litPtr);
-    else
-        ZSTD_wildcopy(op, (*litPtr), sequence.litLength, ZSTD_no_overlap);
+    assert(WILDCOPY_OVERLENGTH >= 16);
+    ZSTD_copy16(op, (*litPtr));
+    if (sequence.litLength > 16) {
+        ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap);
+    }
     op = oLitEnd;
     *litPtr = iLitEnd; /* update for next sequence */
 
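The literal copy now performs an unconditional 16-byte copy and only falls back to the wide copy for the tail of runs longer than 16 bytes, which is why it asserts WILDCOPY_OVERLENGTH >= 16. A minimal standalone sketch of that shape (hypothetical names; wildcopy_stub is a plain memcpy standing in for ZSTD_wildcopy, which in the real code over-copies in 32-byte strides):

    #include <stddef.h>
    #include <string.h>

    typedef unsigned char BYTE;

    /* stand-in for ZSTD_wildcopy(dst, src, length, ZSTD_no_overlap) */
    static void wildcopy_stub(BYTE* dst, const BYTE* src, ptrdiff_t length)
    {
        memcpy(dst, src, (size_t)length);
    }

    static void copy_literals_sketch(BYTE* op, const BYTE* lit, size_t litLength)
    {
        memcpy(op, lit, 16);                  /* fast path: most literal runs fit in 16 bytes,
                                               * and both buffers guarantee 16 bytes of room */
        if (litLength > 16) {                 /* rare slow path: copy only the remainder */
            wildcopy_stub(op + 16, lit + 16, (ptrdiff_t)(litLength - 16));
        }
    }

    int main(void)
    {
        BYTE src[64] = "0123456789abcdefXYZ";
        BYTE dst[64] = {0};
        copy_literals_sketch(dst, src, 19);   /* 19 > 16: exercises both paths */
        return memcmp(dst, src, 19) != 0;     /* exit 0 on success */
    }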
@@ -755,18 +757,18 @@ size_t ZSTD_execSequence(BYTE* op,
     assert(match >= prefixStart);
     assert(sequence.matchLength >= 1);
 
-    /* Nearly all offsets are >= 16 bytes, which means we can use wildcopy
+    /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
      * without overlap checking.
      */
-    if (sequence.offset >= 16) {
-        /* Split out matchLength <= 16 since it is nearly always true. +1% on gcc-9. */
-        if (sequence.matchLength <= 16)
-            ZSTD_copy16(op, match);
-        else
-            ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
+    if (sequence.offset >= WILDCOPY_VECLEN) {
+        /* Split out matchLength <= 32 since it is nearly always true. +1% on gcc-9.
+         * We copy 32 bytes here since matches are generally longer than literals.
+         * In silesia, for example ~10% of matches are longer than 16 bytes.
+         */
+        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
         return sequenceLength;
     }
-    assert(sequence.offset < 16);
+    assert(sequence.offset < WILDCOPY_VECLEN);
 
     /* Copy 8 bytes and spread the offset to be >= 8. */
     ZSTD_overlapCopy8(&op, &match, sequence.offset);
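Matches with offset >= WILDCOPY_VECLEN now go straight to the 32-byte wildcopy, while offsets below 16 keep the ZSTD_overlapCopy8 path: with a short offset the source and destination overlap within a single stride, so a wide copy would read match bytes that have not been written yet. A toy illustration of that hazard and the overlap-safe byte-at-a-time behaviour (illustrative only, not zstd code):

    #include <stdio.h>

    int main(void)
    {
        char buf[16] = "abc";            /* decoded so far: "abc"; match offset = 3 */
        char* op = buf + 3;              /* output cursor */
        const char* match = buf;         /* match source overlaps the bytes being written */

        for (int i = 0; i < 9; i++)      /* overlap-safe: one byte at a time */
            op[i] = match[i];

        printf("%.12s\n", buf);          /* prints "abcabcabcabc"; a single wide memcpy
                                          * over this region would copy stale bytes instead */
        return 0;
    }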