Widen ZSTD_wildcopy to 32 bytes
This commit is contained in:
parent
efd37a64ea
commit
cdad7fa512
@ -197,7 +197,7 @@ static void ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
|
|||||||
static void ZSTD_copy16(void* dst, const void* src) { memcpy(dst, src, 16); }
|
static void ZSTD_copy16(void* dst, const void* src) { memcpy(dst, src, 16); }
|
||||||
#define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
|
#define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
|
||||||
|
|
||||||
#define WILDCOPY_OVERLENGTH 16
|
#define WILDCOPY_OVERLENGTH 32
|
||||||
#define WILDCOPY_VECLEN 16
|
#define WILDCOPY_VECLEN 16
|
||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
@ -237,11 +237,11 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
|
|||||||
* On clang-8 unrolling once is +1.4%, twice is +3.3%, thrice is +3%.
|
* On clang-8 unrolling once is +1.4%, twice is +3.3%, thrice is +3%.
|
||||||
*/
|
*/
|
||||||
COPY16(op, ip);
|
COPY16(op, ip);
|
||||||
if (op >= oend) return;
|
|
||||||
COPY16(op, ip);
|
COPY16(op, ip);
|
||||||
if (op >= oend) return;
|
if (op >= oend) return;
|
||||||
do {
|
do {
|
||||||
COPY16(op, ip);
|
COPY16(op, ip);
|
||||||
|
COPY16(op, ip);
|
||||||
}
|
}
|
||||||
while (op < oend);
|
while (op < oend);
|
||||||
}
|
}
|
||||||
@ -257,7 +257,7 @@ MEM_STATIC void ZSTD_wildcopy8(void* dst, const void* src, ptrdiff_t length)
|
|||||||
BYTE* op = (BYTE*)dst;
|
BYTE* op = (BYTE*)dst;
|
||||||
BYTE* const oend = (BYTE*)op + length;
|
BYTE* const oend = (BYTE*)op + length;
|
||||||
do {
|
do {
|
||||||
COPY8(op, ip)
|
COPY8(op, ip);
|
||||||
} while (op < oend);
|
} while (op < oend);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -724,12 +724,14 @@ size_t ZSTD_execSequence(BYTE* op,
|
|||||||
assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
|
assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
|
||||||
|
|
||||||
/* Copy Literals:
|
/* Copy Literals:
|
||||||
* Split out litLength <= 16 since it is nearly always true. +1% on gcc-9.
|
* Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
|
||||||
|
* We likely don't need the full 32-byte wildcopy.
|
||||||
*/
|
*/
|
||||||
if (sequence.litLength <= 16)
|
assert(WILDCOPY_OVERLENGTH >= 16);
|
||||||
ZSTD_copy16(op, *litPtr);
|
ZSTD_copy16(op, (*litPtr));
|
||||||
else
|
if (sequence.litLength > 16) {
|
||||||
ZSTD_wildcopy(op, (*litPtr), sequence.litLength, ZSTD_no_overlap);
|
ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap);
|
||||||
|
}
|
||||||
op = oLitEnd;
|
op = oLitEnd;
|
||||||
*litPtr = iLitEnd; /* update for next sequence */
|
*litPtr = iLitEnd; /* update for next sequence */
|
||||||
|
|
||||||
@ -755,18 +757,18 @@ size_t ZSTD_execSequence(BYTE* op,
|
|||||||
assert(match >= prefixStart);
|
assert(match >= prefixStart);
|
||||||
assert(sequence.matchLength >= 1);
|
assert(sequence.matchLength >= 1);
|
||||||
|
|
||||||
/* Nearly all offsets are >= 16 bytes, which means we can use wildcopy
|
/* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
|
||||||
* without overlap checking.
|
* without overlap checking.
|
||||||
*/
|
*/
|
||||||
if (sequence.offset >= 16) {
|
if (sequence.offset >= WILDCOPY_VECLEN) {
|
||||||
/* Split out matchLength <= 16 since it is nearly always true. +1% on gcc-9. */
|
/* Split out matchLength <= 32 since it is nearly always true. +1% on gcc-9.
|
||||||
if (sequence.matchLength <= 16)
|
* We copy 32 bytes here since matches are generally longer than literals.
|
||||||
ZSTD_copy16(op, match);
|
* In silesia, for example ~10% of matches are longer than 16 bytes.
|
||||||
else
|
*/
|
||||||
ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
|
ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
|
||||||
return sequenceLength;
|
return sequenceLength;
|
||||||
}
|
}
|
||||||
assert(sequence.offset < 16);
|
assert(sequence.offset < WILDCOPY_VECLEN);
|
||||||
|
|
||||||
/* Copy 8 bytes and spread the offset to be >= 8. */
|
/* Copy 8 bytes and spread the offset to be >= 8. */
|
||||||
ZSTD_overlapCopy8(&op, &match, sequence.offset);
|
ZSTD_overlapCopy8(&op, &match, sequence.offset);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user