Improve zstd_opt build speed and size
Use the same trick as we did for zstd_lazy in PR #2828:

* Create one search function specialization for each (dictMode, mls).
* Select the search function pointer at the top of the match finder.

Additionally, we no longer inline `ZSTD_compressBlock_opt_generic` into every function, since `dictMode` is no longer used as a template parameter. Create two specializations, for opt levels 0 and 2, and call one of the two specializations.

Lastly, remove the hack that disabled inlining for zstd_opt for the Linux Kernel, as we've gotten most of the benefit already.

Compilation time sees a ~4x reduction:

| Compiler | Flags                            | Dev Time (s) | PR Time (s) | Delta |
|----------|----------------------------------|--------------|-------------|-------|
| gcc      | -O3                              | 10.1         | 2.3         | -77%  |
| gcc      | -O3 -fsanitize=address,undefined | 61.1         | 10.2        | -83%  |
| clang    | -O3                              | 9.0          | 2.1         | -76%  |
| clang    | -O3 -fsanitize=address,undefined | 33.5         | 5.1         | -84%  |

Build size is reduced by 150KB - 200KB:

| Compiler | Dev libzstd.a Size (B) | PR libzstd.a Size (B) | Delta |
|----------|------------------------|-----------------------|-------|
| gcc      | 1327476                | 1177108               | -11%  |
| clang    | 1378324                | 1167780               | -15%  |

There is a <2% speed loss in all cases:

| Compiler | Level | Dev Speed (MB/s) | PR Speed (MB/s) | Delta  |
|----------|-------|------------------|-----------------|--------|
| gcc      | 16    | 4.78             | 4.72            | -1.25% |
| gcc      | 17    | 3.49             | 3.46            | -0.85% |
| gcc      | 18    | 2.92             | 2.86            | -2.04% |
| gcc      | 19    | 2.61             | 2.61            | 0.00%  |
| clang    | 16    | 4.69             | 4.80            | 2.34%  |
| clang    | 17    | 3.53             | 3.49            | -1.13% |
| clang    | 18    | 2.86             | 2.85            | -0.34% |
| clang    | 19    | 2.61             | 2.61            | 0.00%  |

Fixes Issue #2862.
This commit is contained in:
parent
21e28f5c24
commit
e5bfaeede7
@@ -8,20 +8,6 @@
|
|||||||
* You may select, at your option, one of the above-listed licenses.
|
* You may select, at your option, one of the above-listed licenses.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
|
||||||
* Disable inlining for the optimal parser for the kernel build.
|
|
||||||
* It is unlikely to be used in the kernel, and where it is used
|
|
||||||
* latency shouldn't matter because it is very slow to begin with.
|
|
||||||
* We prefer a ~180KB binary size win over faster optimal parsing.
|
|
||||||
*
|
|
||||||
* TODO(https://github.com/facebook/zstd/issues/2862):
|
|
||||||
* Improve the code size of the optimal parser in general, so we
|
|
||||||
* don't need this hack for the kernel build.
|
|
||||||
*/
|
|
||||||
#ifdef ZSTD_LINUX_KERNEL
|
|
||||||
#define ZSTD_NO_INLINE 1
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include "zstd_compress_internal.h"
|
#include "zstd_compress_internal.h"
|
||||||
#include "hist.h"
|
#include "hist.h"
|
||||||
#include "zstd_opt.h"
|
#include "zstd_opt.h"
|
||||||
@@ -783,30 +769,83 @@ U32 ZSTD_insertBtAndGetAllMatches (
|
|||||||
return mnum;
|
return mnum;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
typedef U32 (*ZSTD_getAllMatchesFn)(
|
||||||
|
ZSTD_match_t*,
|
||||||
|
ZSTD_matchState_t*,
|
||||||
|
U32*,
|
||||||
|
const BYTE*,
|
||||||
|
const BYTE*,
|
||||||
|
const U32 rep[ZSTD_REP_NUM],
|
||||||
|
U32 const ll0,
|
||||||
|
U32 const lengthToBeat);
|
||||||
|
|
||||||
FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches (
|
FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal(
|
||||||
ZSTD_match_t* matches, /* store result (match found, increasing size) in this table */
|
ZSTD_match_t* matches,
|
||||||
ZSTD_matchState_t* ms,
|
ZSTD_matchState_t* ms,
|
||||||
U32* nextToUpdate3,
|
U32* nextToUpdate3,
|
||||||
const BYTE* ip, const BYTE* const iHighLimit, const ZSTD_dictMode_e dictMode,
|
const BYTE* ip,
|
||||||
const U32 rep[ZSTD_REP_NUM],
|
const BYTE* const iHighLimit,
|
||||||
U32 const ll0,
|
const U32 rep[ZSTD_REP_NUM],
|
||||||
U32 const lengthToBeat)
|
U32 const ll0,
|
||||||
|
U32 const lengthToBeat,
|
||||||
|
const ZSTD_dictMode_e dictMode,
|
||||||
|
const U32 mls)
|
||||||
{
|
{
|
||||||
const ZSTD_compressionParameters* const cParams = &ms->cParams;
|
assert(BOUNDED(3, ms->cParams.minMatch, 6) == mls);
|
||||||
U32 const matchLengthSearch = cParams->minMatch;
|
DEBUGLOG(8, "ZSTD_BtGetAllMatches(dictMode=%d, mls=%u)", (int)dictMode, mls);
|
||||||
DEBUGLOG(8, "ZSTD_BtGetAllMatches");
|
if (ip < ms->window.base + ms->nextToUpdate)
|
||||||
if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */
|
return 0; /* skipped area */
|
||||||
ZSTD_updateTree_internal(ms, ip, iHighLimit, matchLengthSearch, dictMode);
|
ZSTD_updateTree_internal(ms, ip, iHighLimit, mls, dictMode);
|
||||||
switch(matchLengthSearch)
|
return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, mls);
|
||||||
{
|
}
|
||||||
case 3 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 3);
|
|
||||||
default :
|
#define ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls) ZSTD_btGetAllMatches_##dictMode##_##mls
|
||||||
case 4 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 4);
|
|
||||||
case 5 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 5);
|
#define GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, mls) \
|
||||||
case 7 :
|
static U32 ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls)( \
|
||||||
case 6 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 6);
|
ZSTD_match_t* matches, \
|
||||||
|
ZSTD_matchState_t* ms, \
|
||||||
|
U32* nextToUpdate3, \
|
||||||
|
const BYTE* ip, \
|
||||||
|
const BYTE* const iHighLimit, \
|
||||||
|
const U32 rep[ZSTD_REP_NUM], \
|
||||||
|
U32 const ll0, \
|
||||||
|
U32 const lengthToBeat) \
|
||||||
|
{ \
|
||||||
|
return ZSTD_btGetAllMatches_internal( \
|
||||||
|
matches, ms, nextToUpdate3, ip, iHighLimit, \
|
||||||
|
rep, ll0, lengthToBeat, ZSTD_##dictMode, mls); \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define GEN_ZSTD_BT_GET_ALL_MATCHES(dictMode) \
|
||||||
|
GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 3) \
|
||||||
|
GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 4) \
|
||||||
|
GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 5) \
|
||||||
|
GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 6)
|
||||||
|
|
||||||
|
GEN_ZSTD_BT_GET_ALL_MATCHES(noDict)
|
||||||
|
GEN_ZSTD_BT_GET_ALL_MATCHES(extDict)
|
||||||
|
GEN_ZSTD_BT_GET_ALL_MATCHES(dictMatchState)
|
||||||
|
|
||||||
|
#define ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMode) \
|
||||||
|
{ \
|
||||||
|
ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 3), \
|
||||||
|
ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 4), \
|
||||||
|
ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 5), \
|
||||||
|
ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 6) \
|
||||||
|
}
|
||||||
|
|
||||||
|
static ZSTD_getAllMatchesFn ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const* ms, ZSTD_dictMode_e const dictMode)
|
||||||
|
{
|
||||||
|
ZSTD_getAllMatchesFn const getAllMatchesFns[3][4] = {
|
||||||
|
ZSTD_BT_GET_ALL_MATCHES_ARRAY(noDict),
|
||||||
|
ZSTD_BT_GET_ALL_MATCHES_ARRAY(extDict),
|
||||||
|
ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMatchState)
|
||||||
|
};
|
||||||
|
U32 const mls = BOUNDED(3, ms->cParams.minMatch, 6);
|
||||||
|
assert((U32)dictMode < 3);
|
||||||
|
assert(mls - 3 < 4);
|
||||||
|
return getAllMatchesFns[(int)dictMode][mls - 3];
|
||||||
}
|
}
|
||||||
|
|
||||||
/*************************
|
/*************************
|
||||||
@@ -989,6 +1028,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
|
|||||||
const BYTE* const prefixStart = base + ms->window.dictLimit;
|
const BYTE* const prefixStart = base + ms->window.dictLimit;
|
||||||
const ZSTD_compressionParameters* const cParams = &ms->cParams;
|
const ZSTD_compressionParameters* const cParams = &ms->cParams;
|
||||||
|
|
||||||
|
ZSTD_getAllMatchesFn getAllMatches = ZSTD_selectBtGetAllMatches(ms, dictMode);
|
||||||
|
|
||||||
U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
|
U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
|
||||||
U32 const minMatch = (cParams->minMatch == 3) ? 3 : 4;
|
U32 const minMatch = (cParams->minMatch == 3) ? 3 : 4;
|
||||||
U32 nextToUpdate3 = ms->nextToUpdate;
|
U32 nextToUpdate3 = ms->nextToUpdate;
|
||||||
@@ -1016,7 +1057,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
|
|||||||
/* find first match */
|
/* find first match */
|
||||||
{ U32 const litlen = (U32)(ip - anchor);
|
{ U32 const litlen = (U32)(ip - anchor);
|
||||||
U32 const ll0 = !litlen;
|
U32 const ll0 = !litlen;
|
||||||
U32 nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, ip, iend, dictMode, rep, ll0, minMatch);
|
U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch);
|
||||||
ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches,
|
ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches,
|
||||||
(U32)(ip-istart), (U32)(iend - ip));
|
(U32)(ip-istart), (U32)(iend - ip));
|
||||||
if (!nbMatches) { ip++; continue; }
|
if (!nbMatches) { ip++; continue; }
|
||||||
@@ -1133,7 +1174,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
|
|||||||
U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0;
|
U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0;
|
||||||
U32 const previousPrice = (U32)opt[cur].price;
|
U32 const previousPrice = (U32)opt[cur].price;
|
||||||
U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
|
U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
|
||||||
U32 nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, inr, iend, dictMode, opt[cur].rep, ll0, minMatch);
|
U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch);
|
||||||
U32 matchNb;
|
U32 matchNb;
|
||||||
|
|
||||||
ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches,
|
ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches,
|
||||||
@@ -1257,13 +1298,26 @@ _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */
|
|||||||
return (size_t)(iend - anchor);
|
return (size_t)(iend - anchor);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static size_t ZSTD_compressBlock_opt0(
|
||||||
|
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
||||||
|
const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode)
|
||||||
|
{
|
||||||
|
return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode);
|
||||||
|
}
|
||||||
|
|
||||||
|
static size_t ZSTD_compressBlock_opt2(
|
||||||
|
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
||||||
|
const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode)
|
||||||
|
{
|
||||||
|
return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode);
|
||||||
|
}
|
||||||
|
|
||||||
size_t ZSTD_compressBlock_btopt(
|
size_t ZSTD_compressBlock_btopt(
|
||||||
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
||||||
const void* src, size_t srcSize)
|
const void* src, size_t srcSize)
|
||||||
{
|
{
|
||||||
DEBUGLOG(5, "ZSTD_compressBlock_btopt");
|
DEBUGLOG(5, "ZSTD_compressBlock_btopt");
|
||||||
return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_noDict);
|
return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -1289,7 +1343,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
|
|||||||
assert(ms->window.dictLimit == ms->window.lowLimit); /* no dictionary */
|
assert(ms->window.dictLimit == ms->window.lowLimit); /* no dictionary */
|
||||||
assert(ms->window.dictLimit - ms->nextToUpdate <= 1); /* no prefix (note: intentional overflow, defined as 2-complement) */
|
assert(ms->window.dictLimit - ms->nextToUpdate <= 1); /* no prefix (note: intentional overflow, defined as 2-complement) */
|
||||||
|
|
||||||
ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); /* generate stats into ms->opt*/
|
ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/
|
||||||
|
|
||||||
/* invalidate first scan from history */
|
/* invalidate first scan from history */
|
||||||
ZSTD_resetSeqStore(seqStore);
|
ZSTD_resetSeqStore(seqStore);
|
||||||
@@ -1305,7 +1359,7 @@ size_t ZSTD_compressBlock_btultra(
|
|||||||
const void* src, size_t srcSize)
|
const void* src, size_t srcSize)
|
||||||
{
|
{
|
||||||
DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize);
|
DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize);
|
||||||
return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);
|
return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ZSTD_compressBlock_btultra2(
|
size_t ZSTD_compressBlock_btultra2(
|
||||||
@@ -1333,35 +1387,35 @@ size_t ZSTD_compressBlock_btultra2(
|
|||||||
ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize);
|
ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);
|
return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ZSTD_compressBlock_btopt_dictMatchState(
|
size_t ZSTD_compressBlock_btopt_dictMatchState(
|
||||||
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
||||||
const void* src, size_t srcSize)
|
const void* src, size_t srcSize)
|
||||||
{
|
{
|
||||||
return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_dictMatchState);
|
return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ZSTD_compressBlock_btultra_dictMatchState(
|
size_t ZSTD_compressBlock_btultra_dictMatchState(
|
||||||
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
||||||
const void* src, size_t srcSize)
|
const void* src, size_t srcSize)
|
||||||
{
|
{
|
||||||
return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_dictMatchState);
|
return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ZSTD_compressBlock_btopt_extDict(
|
size_t ZSTD_compressBlock_btopt_extDict(
|
||||||
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
||||||
const void* src, size_t srcSize)
|
const void* src, size_t srcSize)
|
||||||
{
|
{
|
||||||
return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_extDict);
|
return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ZSTD_compressBlock_btultra_extDict(
|
size_t ZSTD_compressBlock_btultra_extDict(
|
||||||
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
|
||||||
const void* src, size_t srcSize)
|
const void* src, size_t srcSize)
|
||||||
{
|
{
|
||||||
return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_extDict);
|
return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* note : no btultra2 variant for extDict nor dictMatchState,
|
/* note : no btultra2 variant for extDict nor dictMatchState,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user