From b01552a07ae20b5ed28fdb2084dbe9dadd1a6d9a Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Thu, 1 Mar 2018 11:28:42 -0800 Subject: [PATCH] force inlining of HUF_decodeSymbol*() functions which was not done properly by gcc 4.8, resulting in a major performance difference. ex : zstd -b1 silesia.tar before : dec 680 MB/s after : dec 710 MB/s (without bmi2) after : dec 770 MB/s (with DYNAMIC_BMI2) --- lib/decompress/huf_decompress.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/decompress/huf_decompress.c b/lib/decompress/huf_decompress.c index d2e1bb4a..8b292e79 100644 --- a/lib/decompress/huf_decompress.c +++ b/lib/decompress/huf_decompress.c @@ -143,7 +143,8 @@ size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize) typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4; /* double-symbols decoding */ -static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog) +FORCE_INLINE_TEMPLATE BYTE +HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog) { size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */ BYTE const c = dt[val].byte; @@ -305,7 +306,8 @@ HUF_decompress4X2_usingDTable_internal_body( } -static U32 HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog) +FORCE_INLINE_TEMPLATE U32 +HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog) { size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ memcpy(op, dt+val, 2);