Reduce function size in fast & dfast

Take the same approach as in PR #2828 [0]: remove the functions that
force-inline many function bodies behind a `switch`. Instead, create one
function per "template" combination, and then switch between these
functions. This allows the compiler to break the large function into many
small functions, which generally helps codegen.

Also, in the `extDict` modes, when there is no ext-dict, call the top-level
function instead of the force-inlined one, to save on code size.

I'm specifically doing this because gcc on the parisc architecture doesn't
handle the large function body well, and ends up using a lot of excess
stack space. Outlining these functions fixes it.
This commit is contained in:
Nick Terrell 2021-11-15 17:25:24 -08:00
parent ddae153947
commit 802ea885ef
2 changed files with 71 additions and 27 deletions

View File

@ -468,6 +468,24 @@ _match_stored:
return (size_t)(iend - anchor); return (size_t)(iend - anchor);
} }
/* ZSTD_GEN_DFAST_FN(dictMode, mls) :
 * Defines a static function named
 * ZSTD_compressBlock_doubleFast_<dictMode>_<mls> that takes the usual
 * (ms, seqStore, rep, src, srcSize) arguments and forwards them to
 * ZSTD_compressBlock_doubleFast_<dictMode>_generic, passing `mls` as the
 * final argument.
 * Each expansion yields one standalone function per (dictMode, mls)
 * combination, so the dispatching `switch` statements can call a small
 * function instead of expanding the generic body in every case. */
#define ZSTD_GEN_DFAST_FN(dictMode, mls) \
static size_t ZSTD_compressBlock_doubleFast_##dictMode##_##mls( \
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \
void const* src, size_t srcSize) \
{ \
return ZSTD_compressBlock_doubleFast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \
}
/* Instantiate the doubleFast wrappers for the noDict and dictMatchState
 * modes, one per mls value in 4..7. These are the functions called by name
 * from the `switch` in ZSTD_compressBlock_doubleFast() and
 * ZSTD_compressBlock_doubleFast_dictMatchState(). */
ZSTD_GEN_DFAST_FN(noDict, 4)
ZSTD_GEN_DFAST_FN(noDict, 5)
ZSTD_GEN_DFAST_FN(noDict, 6)
ZSTD_GEN_DFAST_FN(noDict, 7)
ZSTD_GEN_DFAST_FN(dictMatchState, 4)
ZSTD_GEN_DFAST_FN(dictMatchState, 5)
ZSTD_GEN_DFAST_FN(dictMatchState, 6)
ZSTD_GEN_DFAST_FN(dictMatchState, 7)
size_t ZSTD_compressBlock_doubleFast( size_t ZSTD_compressBlock_doubleFast(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
@ -478,13 +496,13 @@ size_t ZSTD_compressBlock_doubleFast(
{ {
default: /* includes case 3 */ default: /* includes case 3 */
case 4 : case 4 :
return ZSTD_compressBlock_doubleFast_noDict_generic(ms, seqStore, rep, src, srcSize, 4); return ZSTD_compressBlock_doubleFast_noDict_4(ms, seqStore, rep, src, srcSize);
case 5 : case 5 :
return ZSTD_compressBlock_doubleFast_noDict_generic(ms, seqStore, rep, src, srcSize, 5); return ZSTD_compressBlock_doubleFast_noDict_5(ms, seqStore, rep, src, srcSize);
case 6 : case 6 :
return ZSTD_compressBlock_doubleFast_noDict_generic(ms, seqStore, rep, src, srcSize, 6); return ZSTD_compressBlock_doubleFast_noDict_6(ms, seqStore, rep, src, srcSize);
case 7 : case 7 :
return ZSTD_compressBlock_doubleFast_noDict_generic(ms, seqStore, rep, src, srcSize, 7); return ZSTD_compressBlock_doubleFast_noDict_7(ms, seqStore, rep, src, srcSize);
} }
} }
@ -498,13 +516,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState(
{ {
default: /* includes case 3 */ default: /* includes case 3 */
case 4 : case 4 :
return ZSTD_compressBlock_doubleFast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 4); return ZSTD_compressBlock_doubleFast_dictMatchState_4(ms, seqStore, rep, src, srcSize);
case 5 : case 5 :
return ZSTD_compressBlock_doubleFast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 5); return ZSTD_compressBlock_doubleFast_dictMatchState_5(ms, seqStore, rep, src, srcSize);
case 6 : case 6 :
return ZSTD_compressBlock_doubleFast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 6); return ZSTD_compressBlock_doubleFast_dictMatchState_6(ms, seqStore, rep, src, srcSize);
case 7 : case 7 :
return ZSTD_compressBlock_doubleFast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 7); return ZSTD_compressBlock_doubleFast_dictMatchState_7(ms, seqStore, rep, src, srcSize);
} }
} }
@ -540,7 +558,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
/* if extDict is invalidated due to maxDistance, switch to "regular" variant */ /* if extDict is invalidated due to maxDistance, switch to "regular" variant */
if (prefixStartIndex == dictStartIndex) if (prefixStartIndex == dictStartIndex)
return ZSTD_compressBlock_doubleFast_noDict_generic(ms, seqStore, rep, src, srcSize, mls); return ZSTD_compressBlock_doubleFast(ms, seqStore, rep, src, srcSize);
/* Search Loop */ /* Search Loop */
while (ip < ilimit) { /* < instead of <=, because (ip+1) */ while (ip < ilimit) { /* < instead of <=, because (ip+1) */
@ -653,6 +671,10 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
return (size_t)(iend - anchor); return (size_t)(iend - anchor);
} }
/* Instantiate the doubleFast extDict wrappers, one per mls value in 4..7.
 * Each forwards to ZSTD_compressBlock_doubleFast_extDict_generic and is
 * called by name from the `switch` in ZSTD_compressBlock_doubleFast_extDict(). */
ZSTD_GEN_DFAST_FN(extDict, 4)
ZSTD_GEN_DFAST_FN(extDict, 5)
ZSTD_GEN_DFAST_FN(extDict, 6)
ZSTD_GEN_DFAST_FN(extDict, 7)
size_t ZSTD_compressBlock_doubleFast_extDict( size_t ZSTD_compressBlock_doubleFast_extDict(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
@ -663,12 +685,12 @@ size_t ZSTD_compressBlock_doubleFast_extDict(
{ {
default: /* includes case 3 */ default: /* includes case 3 */
case 4 : case 4 :
return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 4); return ZSTD_compressBlock_doubleFast_extDict_4(ms, seqStore, rep, src, srcSize);
case 5 : case 5 :
return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 5); return ZSTD_compressBlock_doubleFast_extDict_5(ms, seqStore, rep, src, srcSize);
case 6 : case 6 :
return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 6); return ZSTD_compressBlock_doubleFast_extDict_6(ms, seqStore, rep, src, srcSize);
case 7 : case 7 :
return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 7); return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize);
} }
} }

View File

@ -90,7 +90,7 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
* This is also the work we do at the beginning to enter the loop initially. * This is also the work we do at the beginning to enter the loop initially.
*/ */
FORCE_INLINE_TEMPLATE size_t FORCE_INLINE_TEMPLATE size_t
ZSTD_compressBlock_fast_generic( ZSTD_compressBlock_fast_noDict_generic(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
void const* src, size_t srcSize, void const* src, size_t srcSize,
U32 const mls) U32 const mls)
@ -310,6 +310,18 @@ _match: /* Requires: ip0, match0, offcode */
goto _start; goto _start;
} }
/* ZSTD_GEN_FAST_FN(dictMode, mls) :
 * Defines a static function named ZSTD_compressBlock_fast_<dictMode>_<mls>
 * that takes the usual (ms, seqStore, rep, src, srcSize) arguments and
 * forwards them to ZSTD_compressBlock_fast_<dictMode>_generic, passing
 * `mls` as the final argument.
 * Each expansion yields one standalone function per (dictMode, mls)
 * combination, so the dispatching `switch` statements can call a small
 * function instead of expanding the generic body in every case. */
#define ZSTD_GEN_FAST_FN(dictMode, mls) \
static size_t ZSTD_compressBlock_fast_##dictMode##_##mls( \
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \
void const* src, size_t srcSize) \
{ \
return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \
}
/* Instantiate the fast noDict wrappers, one per mls value in 4..7.
 * Each forwards to ZSTD_compressBlock_fast_noDict_generic and is called by
 * name from the `switch` in ZSTD_compressBlock_fast(). */
ZSTD_GEN_FAST_FN(noDict, 4)
ZSTD_GEN_FAST_FN(noDict, 5)
ZSTD_GEN_FAST_FN(noDict, 6)
ZSTD_GEN_FAST_FN(noDict, 7)
size_t ZSTD_compressBlock_fast( size_t ZSTD_compressBlock_fast(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
@ -321,13 +333,13 @@ size_t ZSTD_compressBlock_fast(
{ {
default: /* includes case 3 */ default: /* includes case 3 */
case 4 : case 4 :
return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 4); return ZSTD_compressBlock_fast_noDict_4(ms, seqStore, rep, src, srcSize);
case 5 : case 5 :
return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 5); return ZSTD_compressBlock_fast_noDict_5(ms, seqStore, rep, src, srcSize);
case 6 : case 6 :
return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 6); return ZSTD_compressBlock_fast_noDict_6(ms, seqStore, rep, src, srcSize);
case 7 : case 7 :
return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 7); return ZSTD_compressBlock_fast_noDict_7(ms, seqStore, rep, src, srcSize);
} }
} }
@ -479,6 +491,12 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
return (size_t)(iend - anchor); return (size_t)(iend - anchor);
} }
/* Instantiate the fast dictMatchState wrappers, one per mls value in 4..7.
 * Each forwards to ZSTD_compressBlock_fast_dictMatchState_generic and is
 * called by name from the `switch` in ZSTD_compressBlock_fast_dictMatchState(). */
ZSTD_GEN_FAST_FN(dictMatchState, 4)
ZSTD_GEN_FAST_FN(dictMatchState, 5)
ZSTD_GEN_FAST_FN(dictMatchState, 6)
ZSTD_GEN_FAST_FN(dictMatchState, 7)
size_t ZSTD_compressBlock_fast_dictMatchState( size_t ZSTD_compressBlock_fast_dictMatchState(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
void const* src, size_t srcSize) void const* src, size_t srcSize)
@ -489,13 +507,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState(
{ {
default: /* includes case 3 */ default: /* includes case 3 */
case 4 : case 4 :
return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 4); return ZSTD_compressBlock_fast_dictMatchState_4(ms, seqStore, rep, src, srcSize);
case 5 : case 5 :
return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 5); return ZSTD_compressBlock_fast_dictMatchState_5(ms, seqStore, rep, src, srcSize);
case 6 : case 6 :
return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 6); return ZSTD_compressBlock_fast_dictMatchState_6(ms, seqStore, rep, src, srcSize);
case 7 : case 7 :
return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 7); return ZSTD_compressBlock_fast_dictMatchState_7(ms, seqStore, rep, src, srcSize);
} }
} }
@ -530,7 +548,7 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
/* switch to "regular" variant if extDict is invalidated due to maxDistance */ /* switch to "regular" variant if extDict is invalidated due to maxDistance */
if (prefixStartIndex == dictStartIndex) if (prefixStartIndex == dictStartIndex)
return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, mls); return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);
/* Search Loop */ /* Search Loop */
while (ip < ilimit) { /* < instead of <=, because (ip+1) */ while (ip < ilimit) { /* < instead of <=, because (ip+1) */
@ -603,6 +621,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
return (size_t)(iend - anchor); return (size_t)(iend - anchor);
} }
/* Instantiate the fast extDict wrappers, one per mls value in 4..7.
 * Each forwards to ZSTD_compressBlock_fast_extDict_generic and is called by
 * name from the `switch` in ZSTD_compressBlock_fast_extDict(). */
ZSTD_GEN_FAST_FN(extDict, 4)
ZSTD_GEN_FAST_FN(extDict, 5)
ZSTD_GEN_FAST_FN(extDict, 6)
ZSTD_GEN_FAST_FN(extDict, 7)
size_t ZSTD_compressBlock_fast_extDict( size_t ZSTD_compressBlock_fast_extDict(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
@ -613,12 +635,12 @@ size_t ZSTD_compressBlock_fast_extDict(
{ {
default: /* includes case 3 */ default: /* includes case 3 */
case 4 : case 4 :
return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 4); return ZSTD_compressBlock_fast_extDict_4(ms, seqStore, rep, src, srcSize);
case 5 : case 5 :
return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 5); return ZSTD_compressBlock_fast_extDict_5(ms, seqStore, rep, src, srcSize);
case 6 : case 6 :
return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 6); return ZSTD_compressBlock_fast_extDict_6(ms, seqStore, rep, src, srcSize);
case 7 : case 7 :
return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 7); return ZSTD_compressBlock_fast_extDict_7(ms, seqStore, rep, src, srcSize);
} }
} }