force inlining of HUF_decodeSymbol*() functions

which was not done properly by gcc 4.8
resulting in major performance difference.

ex :
zstd -b1 silesia.tar
before : dec 680 MB/s
after  : dec 710 MB/s  (without bmi2)
after  : dec 770 MB/s  (with DYNAMIC_BMI2)
dev
Yann Collet 2018-03-01 11:28:42 -08:00
parent 564cb1b640
commit b01552a07a
1 changed files with 4 additions and 2 deletions

View File

@ -143,7 +143,8 @@ size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4; /* double-symbols decoding */
static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)
FORCE_INLINE_TEMPLATE BYTE
HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)
{
size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
BYTE const c = dt[val].byte;
@ -305,7 +306,8 @@ HUF_decompress4X2_usingDTable_internal_body(
}
static U32 HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
FORCE_INLINE_TEMPLATE U32
HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
{
size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
memcpy(op, dt+val, 2);