Improvements in zstd decode performance
Summary:
The idea behind wildcopy is that it can be cheaper to copy more bytes (say, 8) than fewer (say, 3). This change takes that further by exploiting two properties:

1. It is almost always OK to copy 16 bytes instead of 8, which means fewer copy instructions and fewer branches.
2. A 16-byte chunk size means that ~90% of wildcopy invocations will have a trip count of 1, which improves branch prediction.

Speedup on a Xeon E5-2680v4 is in the range of 3-5%.

Measured wildcopy length distributions on silesia.tar:

level    <=8      <=16     <=24     >24
    1    78.05%   11.49%    3.52%   6.94%
    3    82.14%    8.99%    2.44%   6.43%
    6    85.81%    6.51%    2.92%   4.76%
    8    83.02%    7.31%    3.64%   6.03%
   10    84.13%    6.67%    3.29%   5.91%
   15    77.58%    7.55%    5.21%   9.66%
   16    80.07%    7.20%    3.98%   8.75%

Test Plan: benchmark silesia, make checkdev
parent d944197e79
commit b830599582
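
To make the summary concrete before reading the diff: the sketch below is a stand-alone illustration of the fast path (plain C with a made-up name, not the patch itself). A do-while over 16-byte chunks needs exactly one iteration for any length up to 16, which per the table above covers the large majority of wildcopy calls, and it may write up to 15 bytes past the requested end, which is why the patch threads a global output-end pointer (oend_g) through the call sites.

#include <stdio.h>
#include <string.h>

/* Stand-alone sketch (hypothetical name, not the library's code): round the
 * copy up to whole 16-byte chunks.  May write up to 15 bytes past dst+length,
 * so the caller must guarantee that much slack before the real buffer end. */
static void wildcopy16_sketch(void* dst, const void* src, size_t length)
{
    unsigned char* op = (unsigned char*)dst;
    const unsigned char* ip = (const unsigned char*)src;
    unsigned char* const oend = op + length;
    do {
        memcpy(op, ip, 16);   /* one chunk serves any length 1..16: trip count 1 */
        op += 16;
        ip += 16;
    } while (op < oend);
}

int main(void)
{
    const char src[32] = "0123456789abcdef0123456789abcde";
    char dst[8 + 16];                 /* 8 bytes of payload + 16 bytes of slack */
    wildcopy16_sketch(dst, src, 5);   /* one 16-byte copy serves a 5-byte request */
    printf("%.5s\n", dst);            /* prints: 01234 */
    return 0;
}
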
@@ -191,9 +191,11 @@ static const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG;
 /*-*******************************************
 *  Shared functions to include for inlining
 *********************************************/
+FORCE_INLINE_ATTR
 static void ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
 
 #define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
+FORCE_INLINE_ATTR
 static void ZSTD_copy16(void* dst, const void* src) { memcpy(dst, src, 16); }
 #define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
 
@@ -209,7 +211,7 @@ typedef enum {
 /*! ZSTD_wildcopy() :
  *  custom version of memcpy(), can overwrite up to WILDCOPY_OVERLENGTH bytes (if length==0) */
 MEM_STATIC FORCE_INLINE_ATTR DONT_VECTORIZE
-void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e ovtype)
+void ZSTD_wildcopy(void* dst, const void* src, BYTE* oend_g, ptrdiff_t length, ZSTD_overlap_e ovtype)
 {
     ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src;
     const BYTE* ip = (const BYTE*)src;
@@ -217,25 +219,33 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
     BYTE* const oend = op + length;
 
     assert(diff >= 8 || (ovtype == ZSTD_no_overlap && diff < -8));
 
     if (length < VECLEN || (ovtype == ZSTD_overlap_src_before_dst && diff < VECLEN)) {
       do
           COPY8(op, ip)
       while (op < oend);
     }
     else {
-      if ((length & 8) == 0)
-          COPY8(op, ip);
+      if (oend < oend_g-16) {
+        /* common case */
         do {
           COPY16(op, ip);
         }
         while (op < oend);
       }
+      else {
+        do {
+          COPY8(op, ip);
+        }
+        while (op < oend);
+      }
+    }
 }
 
 /*! ZSTD_wildcopy_16min() :
- *  same semantics as ZSTD_wilcopy() except guaranteed to be able to copy 16 bytes at the start */
+ *  same semantics as ZSTD_wildcopy() except guaranteed to be able to copy 16 bytes at the start */
 MEM_STATIC FORCE_INLINE_ATTR DONT_VECTORIZE
-void ZSTD_wildcopy_16min(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e ovtype)
+void ZSTD_wildcopy_16min(void* dst, const void* src, BYTE* oend_g, ptrdiff_t length, ZSTD_overlap_e ovtype)
 {
     ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src;
     const BYTE* ip = (const BYTE*)src;
@@ -246,18 +256,26 @@ void ZSTD_wildcopy_16min(void* dst, const void* src, ptrdiff_t length, ZSTD_over
     assert(diff >= 8 || (ovtype == ZSTD_no_overlap && diff < -8));
 
     if (ovtype == ZSTD_overlap_src_before_dst && diff < VECLEN) {
-      do
-          COPY8(op, ip)
+      do {
+          COPY8(op, ip);
+      }
       while (op < oend);
     }
     else {
-      if ((length & 8) == 0)
-          COPY8(op, ip);
+      if (oend < oend_g-16) {
+        /* common case */
         do {
           COPY16(op, ip);
         }
         while (op < oend);
       }
+      else {
+        do {
+          COPY8(op, ip);
+        }
+        while (op < oend);
+      }
+    }
 }
 
 MEM_STATIC void ZSTD_wildcopy_e(void* dst, const void* src, void* dstEnd)   /* should be faster for decoding, but strangely, not verified on all platform */
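
Two guards in the new functions above deserve a note. First, the 16-byte path is taken only when oend < oend_g-16, so a COPY16 issued just below oend cannot cross the real end of the output buffer; close to the buffer end the code falls back to the original 8-byte loop and keeps the old WILDCOPY_OVERLENGTH overwrite bound. Second, overlapping copies with diff < VECLEN stay on the 8-byte loop, because a chunked copy is only correct when the chunk does not out-run the distance between src and dst. The stand-alone program below (plain C, not library code) illustrates the second point: extending a repeating match forward in 8-byte chunks is safe exactly when the offset is at least 8.

#include <stdio.h>
#include <string.h>

/* Illustration: copying a match that overlaps its own destination.  Each
 * 8-byte memcpy reads only bytes finalized by earlier iterations, which
 * holds as long as dst - src >= 8 (the chunk size). */
int main(void)
{
    char buf[64] = "ABCDEFGH";   /* 8-byte pattern to extend in place */
    const char* src = buf;
    char* dst = buf + 8;         /* overlap distance (match offset) == 8 */
    char* const end = buf + 32;
    while (dst < end) {
        memcpy(dst, src, 8);     /* regions never overlap within one call */
        dst += 8;
        src += 8;
    }
    printf("%.32s\n", buf);      /* prints: ABCDEFGH repeated four times */
    return 0;
}
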
@@ -359,7 +359,7 @@ MEM_STATIC void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const v
     /* copy Literals */
     assert(seqStorePtr->maxNbLit <= 128 KB);
     assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + seqStorePtr->maxNbLit);
-    ZSTD_wildcopy(seqStorePtr->lit, literals, (ptrdiff_t)litLength, ZSTD_no_overlap);
+    ZSTD_wildcopy(seqStorePtr->lit, literals, seqStorePtr->lit + litLength + 8, (ptrdiff_t)litLength, ZSTD_no_overlap);
     seqStorePtr->lit += litLength;
 
     /* literal Length */
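
One subtlety at this call site: the bound passed as oend_g is seqStorePtr->lit + litLength + 8, only 8 bytes past the copy's own oend. Substituting into the new guard, oend < oend_g-16 becomes lit+litLength < lit+litLength-8, which is never true, so this compression-side copy always stays on the 8-byte loop and the seqStore buffer keeps its original at-most-8-byte overwrite bound. A small self-checking sketch of that arithmetic (the integer modeling of the pointers is mine):

#include <assert.h>
#include <stddef.h>

/* Model dst as offset 0; then oend = length and oend_g = length + 8.
 * The 16-byte fast-path condition oend < oend_g - 16 reduces to 0 < -8,
 * i.e. it can never fire for this caller. */
int main(void)
{
    for (ptrdiff_t length = 0; length < 1000; ++length) {
        ptrdiff_t oend   = length;
        ptrdiff_t oend_g = length + 8;
        assert(!(oend < oend_g - 16));   /* the 8-byte path is always chosen */
    }
    return 0;
}
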
@@ -641,7 +641,7 @@ size_t ZSTD_execSequence(BYTE* op,
 
     /* copy Literals */
     if (sequence.litLength > 8)
-        ZSTD_wildcopy_16min(op, (*litPtr), sequence.litLength, ZSTD_no_overlap);   /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
+        ZSTD_wildcopy_16min(op, (*litPtr), oend, sequence.litLength, ZSTD_no_overlap);   /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
     else
         ZSTD_copy8(op, *litPtr);
     op = oLitEnd;
@@ -690,13 +690,13 @@ size_t ZSTD_execSequence(BYTE* op,
 
     if (oMatchEnd > oend-(16-MINMATCH)) {
         if (op < oend_w) {
-            ZSTD_wildcopy(op, match, oend_w - op, ZSTD_overlap_src_before_dst);
+            ZSTD_wildcopy(op, match, oend, oend_w - op, ZSTD_overlap_src_before_dst);
             match += oend_w - op;
             op = oend_w;
         }
         while (op < oMatchEnd) *op++ = *match++;
     } else {
-        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst);   /* works even if matchLength < 8 */
+        ZSTD_wildcopy(op, match, oend, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst);   /* works even if matchLength < 8 */
     }
     return sequenceLength;
 }
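
The last call in each branch above is worth a gloss: earlier in ZSTD_execSequence (outside these hunks) the first 8 bytes of the match have already been copied and op/match advanced, so the remaining length (ptrdiff_t)sequence.matchLength-8 may be zero or negative. Per the comment on ZSTD_wildcopy above, that is still safe: the do-while always performs at least one copy, and the caller reserves WILDCOPY_OVERLENGTH bytes of slack past the true end. A minimal sketch of that at-least-one-chunk behavior (hypothetical helper, not the library's code):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* A wildcopy-style loop copies one 8-byte chunk even when length <= 0;
 * the surplus bytes land in slack the caller reserved past the true end. */
static void wildcopy8_sketch(char* op, const char* ip, ptrdiff_t length)
{
    char* const oend = op + length;
    do {
        memcpy(op, ip, 8);   /* executes at least once, even for length <= 0 */
        op += 8;
        ip += 8;
    } while (op < oend);
}

int main(void)
{
    const char src[16] = "matchbytes";
    char dst[16] = {0};
    wildcopy8_sketch(dst, src, 3);   /* one 8-byte copy; only 3 bytes needed */
    printf("%.3s\n", dst);           /* prints: mat */
    return 0;
}
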
@@ -722,7 +722,7 @@ size_t ZSTD_execSequenceLong(BYTE* op,
 
     /* copy Literals */
     if (sequence.litLength > 8)
-        ZSTD_wildcopy_16min(op, *litPtr, sequence.litLength, ZSTD_no_overlap);   /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
+        ZSTD_wildcopy_16min(op, *litPtr, oend, sequence.litLength, ZSTD_no_overlap);   /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
     else
         ZSTD_copy8(op, *litPtr);   /* note : op <= oLitEnd <= oend_w == oend - 8 */
 
@@ -772,13 +772,13 @@ size_t ZSTD_execSequenceLong(BYTE* op,
 
     if (oMatchEnd > oend-(16-MINMATCH)) {
         if (op < oend_w) {
-            ZSTD_wildcopy(op, match, oend_w - op, ZSTD_overlap_src_before_dst);
+            ZSTD_wildcopy(op, match, oend, oend_w - op, ZSTD_overlap_src_before_dst);
             match += oend_w - op;
             op = oend_w;
         }
         while (op < oMatchEnd) *op++ = *match++;
     } else {
-        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst);   /* works even if matchLength < 8 */
+        ZSTD_wildcopy(op, match, oend, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst);   /* works even if matchLength < 8 */
     }
     return sequenceLength;
 }