force inlining of HUF_decodeSymbol*() functions
which was not done properly by gcc 4.8 resulting in major performance difference. ex : zstd -b1 silesia.tar before : dec 680 MB/s after : dec 710 MB/s (without bmi2) after : dec 770 MB/s (with DYNAMIC_BMI2)dev
parent
564cb1b640
commit
b01552a07a
|
@ -143,7 +143,8 @@ size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
|
|||
|
||||
typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4; /* double-symbols decoding */
|
||||
|
||||
static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)
|
||||
FORCE_INLINE_TEMPLATE BYTE
|
||||
HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)
|
||||
{
|
||||
size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
|
||||
BYTE const c = dt[val].byte;
|
||||
|
@ -305,7 +306,8 @@ HUF_decompress4X2_usingDTable_internal_body(
|
|||
}
|
||||
|
||||
|
||||
static U32 HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
|
||||
FORCE_INLINE_TEMPLATE U32
|
||||
HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
|
||||
{
|
||||
size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
|
||||
memcpy(op, dt+val, 2);
|
||||
|
|
Loading…
Reference in New Issue