From ba1fd17a9f90e87c827e470ca7a2656bc3def669 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Sun, 16 Aug 2020 22:22:33 -0700 Subject: [PATCH] speed up literal header decoding --- lib/common/entropy_common.c | 13 +- lib/common/fse.h | 13 +- lib/common/fse_decompress.c | 75 ++++++++- lib/common/huf.h | 13 ++ lib/compress/fse_compress.c | 4 +- lib/decompress/huf_decompress.c | 205 ++++++++++++++++++++----- lib/decompress/zstd_decompress_block.c | 12 +- lib/decompress/zstd_decompress_block.h | 4 +- 8 files changed, 274 insertions(+), 65 deletions(-) diff --git a/lib/common/entropy_common.c b/lib/common/entropy_common.c index 0a13d9d9..3969d2b3 100644 --- a/lib/common/entropy_common.c +++ b/lib/common/entropy_common.c @@ -96,7 +96,6 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t bitStream >>= 2 * repeats; bitCount += 2 * repeats; - assert(bitCount < 30 && (bitStream & 3) != 3); charnum += bitStream & 3; bitCount += 2; @@ -186,6 +185,15 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, const void* src, size_t srcSize) +{ + U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; + return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp)); +} + +size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize) { U32 weightTotal; const BYTE* ip = (const BYTE*) src; @@ -208,9 +216,8 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, huffWeight[n+1] = ip[n/2] & 15; } } } else { /* header compressed with FSE (normal case) */ - FSE_DTable fseWorkspace[FSE_DTABLE_SIZE_U32(6)]; /* 6 is max possible tableLog for HUF header (maybe even 5, to be tested) */ if (iSize+1 > srcSize) return ERROR(srcSize_wrong); - oSize = FSE_decompress_wksp(huffWeight, hwSize-1, ip+1, iSize, fseWorkspace, 6); /* max (hwSize-1) values decoded, as last one is implied */ + oSize = FSE_decompress_wksp(huffWeight, hwSize-1, ip+1, iSize, 6, workSpace, wkspSize); /* max (hwSize-1) values decoded, as last one is implied */ if (FSE_isError(oSize)) return oSize; } diff --git a/lib/common/fse.h b/lib/common/fse.h index 55dd8f3a..8d15b340 100644 --- a/lib/common/fse.h +++ b/lib/common/fse.h @@ -311,7 +311,7 @@ unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsi * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`). * FSE_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable. */ -#define FSE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) ) +#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? 
(1 << (maxTableLog - 2)) : 1024) ) size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); @@ -326,14 +326,21 @@ size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); */ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); +#define FSE_BUILD_DTABLE_WKSP_SIZE(maxTableLog, maxSymbolValue) (sizeof(short) * (maxSymbolValue + 1) + (1 << maxTableLog) + 8) +#define FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ((FSE_BUILD_DTABLE_WKSP_SIZE(maxTableLog, maxSymbolValue) + sizeof(unsigned) - 1) / sizeof(unsigned)) +FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); +/**< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ + size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits); /**< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */ size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue); /**< build a fake FSE_DTable, designed to always generate the same symbolValue */ -size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog); -/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DTABLE_SIZE_U32(maxLog)` */ +#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue)) +#define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned)) +size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize); +/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */ typedef enum { FSE_repeat_none, /**< Cannot use the previous table */ diff --git a/lib/common/fse_decompress.c b/lib/common/fse_decompress.c index 54dab255..119ee7b6 100644 --- a/lib/common/fse_decompress.c +++ b/lib/common/fse_decompress.c @@ -68,17 +68,24 @@ void FSE_freeDTable (FSE_DTable* dt) free(dt); } -size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) +size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) { + U32 wksp[FSE_BUILD_DTABLE_WKSP_SIZE_U32(FSE_TABLELOG_ABSOLUTE_MAX, FSE_MAX_SYMBOL_VALUE)]; + return FSE_buildDTable_wksp(dt, normalizedCounter, maxSymbolValue, tableLog, wksp, sizeof(wksp)); +} + +size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) { void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr); - U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1]; + U16* symbolNext = (U16*)workSpace; + BYTE* spread = (BYTE*)(symbolNext + maxSymbolValue + 1); U32 const maxSV1 = maxSymbolValue + 1; U32 const tableSize = 1 << 
tableLog; U32 highThreshold = tableSize-1; /* Sanity Checks */ + if (FSE_BUILD_DTABLE_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(maxSymbolValue_tooLarge); if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge); if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); @@ -100,7 +107,53 @@ size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned } /* Spread symbols */ - { U32 const tableMask = tableSize-1; + if (highThreshold == tableSize - 1) { + size_t const tableMask = tableSize-1; + size_t const step = FSE_TABLESTEP(tableSize); + /* First lay down the symbols in order. + * We use a uint64_t to lay down 8 bytes at a time. This reduces branch + * misses since small blocks generally have small table logs, so nearly + * all symbols have counts <= 8. We ensure we have 8 bytes at the end of + * our buffer to handle the over-write. + */ + { + U64 const add = 0x0101010101010101ull; + size_t pos = 0; + U64 sv = 0; + U32 s; + for (s=0; s wkspSize) return ERROR(tableLog_tooLarge); + workSpace = dtable + FSE_DTABLE_SIZE_U32(tableLog); + wkspSize -= FSE_DTABLE_SIZE(tableLog); - return FSE_decompress_usingDTable (dst, dstCapacity, ip, cSrcSize, workSpace); /* always return, even if it is an error code */ + CHECK_F( FSE_buildDTable_wksp(dtable, counting, maxSymbolValue, tableLog, workSpace, wkspSize) ); + + return FSE_decompress_usingDTable (dst, dstCapacity, ip, cSrcSize, dtable); /* always return, even if it is an error code */ } @@ -278,8 +336,9 @@ typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; size_t FSE_decompress(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize) { - DTable_max_t dt; /* Static analyzer seems unable to understand this table will be properly initialized later */ - return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, dt, FSE_MAX_TABLELOG); + /* Static analyzer seems unable to understand this table will be properly initialized later */ + U32 wksp[FSE_DECOMPRESS_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)]; + return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, FSE_MAX_TABLELOG, wksp, sizeof(wksp)); } diff --git a/lib/common/huf.h b/lib/common/huf.h index ef432685..90e613a1 100644 --- a/lib/common/huf.h +++ b/lib/common/huf.h @@ -111,6 +111,8 @@ HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, /* *** Dependencies *** */ #include "mem.h" /* U32 */ +#define FSE_STATIC_LINKING_ONLY +#include "fse.h" /* *** Constants *** */ @@ -226,6 +228,17 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, const void* src, size_t srcSize); +/*! HUF_readStats_wksp() : + * Same as HUF_readStats() but takes an external workspace which must be + * 4-byte aligned and its size must be >= HUF_READ_STATS_WORKSPACE_SIZE. 
+ */ +#define HUF_READ_STATS_WORKSPACE_SIZE_U32 FSE_DECOMPRESS_WKSP_SIZE_U32(6, HUF_TABLELOG_MAX-1) +#define HUF_READ_STATS_WORKSPACE_SIZE (HUF_READ_STATS_WORKSPACE_SIZE_U32 * sizeof(unsigned)) +size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, + U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workspace, size_t wkspSize); + /** HUF_readCTable() : * Loading a CTable saved with HUF_writeCTable() */ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned *hasZeroWeights); diff --git a/lib/compress/fse_compress.c b/lib/compress/fse_compress.c index 1187e3e6..48b654d0 100644 --- a/lib/compress/fse_compress.c +++ b/lib/compress/fse_compress.c @@ -630,7 +630,7 @@ size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t src size_t const scratchBufferSize = wkspSize - (CTableSize * sizeof(FSE_CTable)); /* init conditions */ - if (wkspSize < FSE_WKSP_SIZE_U32(tableLog, maxSymbolValue)) return ERROR(tableLog_tooLarge); + if (wkspSize < FSE_COMPRESS_WKSP_SIZE_U32(tableLog, maxSymbolValue)) return ERROR(tableLog_tooLarge); if (srcSize <= 1) return 0; /* Not compressible */ if (!maxSymbolValue) maxSymbolValue = FSE_MAX_SYMBOL_VALUE; if (!tableLog) tableLog = FSE_DEFAULT_TABLELOG; @@ -674,7 +674,7 @@ typedef struct { size_t FSE_compress2 (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog) { fseWkspMax_t scratchBuffer; - DEBUG_STATIC_ASSERT(sizeof(scratchBuffer) >= FSE_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)); /* compilation failures here means scratchBuffer is not large enough */ + DEBUG_STATIC_ASSERT(sizeof(scratchBuffer) >= FSE_COMPRESS_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)); /* compilation failures here means scratchBuffer is not large enough */ if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); return FSE_compress_wksp(dst, dstCapacity, src, srcSize, maxSymbolValue, tableLog, &scratchBuffer, sizeof(scratchBuffer)); } diff --git a/lib/decompress/huf_decompress.c b/lib/decompress/huf_decompress.c index 68293a13..fbe4127a 100644 --- a/lib/decompress/huf_decompress.c +++ b/lib/decompress/huf_decompress.c @@ -115,6 +115,70 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) /*-***************************/ typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decoding */ +/** + * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at + * a time. + */ +static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { + U64 D4; + if (MEM_isLittleEndian()) { + D4 = symbol + (nbBits << 8); + } else { + D4 = (symbol << 8) + nbBits; + } + D4 *= 0x0001000100010001ULL; + return D4; +} + +#if 0 +// TODO: Remove this +/* BMI2 version that uses _pdep_u64() for weight 1 and 2 symbols. + * This doesn't provide much gains, so not worth the complexity. + * Leaving in for now but will remove before I commit. 
+ */ +#include + +static U64 HUF_DEltX1_pack4(BYTE const* symbols, BYTE nbBits) { + U64 D4; + if (MEM_isLittleEndian()) { + U64 const nbBits4 = nbBits * 0x0100010001000100ULL; + U64 const symbols4 = _pdep_u64(MEM_read32(symbols), 0x00FF00FF00FF00FFULL); + D4 = symbols4 | nbBits4; + } else { + U64 const nbBits4 = nbBits * 0x0001000100010001ULL; + U64 const symbols4 = _pdep_u64(MEM_read32(symbols), 0xFF00FF00FF00FF00ULL); + D4 = symbols4 | nbBits4; + } + return D4; +} + +static U64 HUF_DEltX1_pack2(BYTE const* symbols, BYTE nbBits) { + U64 D4; + if (MEM_isLittleEndian()) { + U64 const nbBits4 = nbBits * 0x0100010001000100ULL; + U64 symbols4 = _pdep_u64(MEM_read16(symbols), 0x000000FF000000FFULL); + symbols4 = symbols4 * 0x00010001ULL; + D4 = symbols4 | nbBits4; + } else { + U64 const nbBits4 = nbBits * 0x0001000100010001ULL; + U64 symbols4 = _pdep_u64(MEM_read16(symbols), 0x0000FF000000FF00ULL); + symbols4 *= 0x00010001ULL; + D4 = symbols4 | nbBits4; + } + return D4; +} +#endif + +typedef struct { + U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; + U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1]; + U32 statsWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; + BYTE symbols[HUF_SYMBOLVALUE_MAX + 1]; + BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; +} HUF_ReadDTableX1_Workspace; + + +// TODO: Template based on BMI2 (5% boost) size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) { U32 tableLog = 0; @@ -122,22 +186,15 @@ size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize size_t iSize; void* const dtPtr = DTable + 1; HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr; + HUF_ReadDTableX1_Workspace* wksp = (HUF_ReadDTableX1_Workspace*)workSpace; - U32* rankVal; - BYTE* huffWeight; - size_t spaceUsed32 = 0; - - rankVal = (U32 *)workSpace + spaceUsed32; - spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1; - huffWeight = (BYTE *)((U32 *)workSpace + spaceUsed32); - spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2; - - if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge); + DEBUG_STATIC_ASSERT(HUF_DECOMPRESS_WORKSPACE_SIZE >= sizeof(*wksp)); + if (sizeof(*wksp) > wkspSize) return ERROR(tableLog_tooLarge); DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); /* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ - iSize = HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize); + iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp)); if (HUF_isError(iSize)) return iSize; /* Table header */ @@ -148,39 +205,103 @@ size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize memcpy(DTable, &dtd, sizeof(dtd)); } - /* Calculate starting value for each rank */ - { U32 n, nextRankStart = 0; - for (n=1; nrankVal[n]; + wksp->rankStart[n] = current; + } + // TODO: This loop is now the bottleneck: Can this be made faster? 
+ for (n=0; n < (int)nbSymbols; ++n) { + size_t const w = wksp->huffWeight[n]; + wksp->symbols[wksp->rankStart[w]++] = n; + } + } - /* fill DTable */ - { U32 n; - size_t const nEnd = nbSymbols; - for (n=0; n> 1; - size_t const uStart = rankVal[w]; - size_t const uEnd = uStart + length; - size_t u; - HUF_DEltX1 D; - D.byte = (BYTE)n; - D.nbBits = (BYTE)(tableLog + 1 - w); - rankVal[w] = (U32)uEnd; - if (length < 4) { - /* Use length in the loop bound so the compiler knows it is short. */ - for (u = 0; u < length; ++u) - dt[uStart + u] = D; - } else { - /* Unroll the loop 4 times, we know it is a power of 2. */ - for (u = uStart; u < uEnd; u += 4) { - dt[u + 0] = D; - dt[u + 1] = D; - dt[u + 2] = D; - dt[u + 3] = D; - } } } } + /* fill DTable + * We fill all entries of each weight in order. + * That way length is a constant for each iteration of the outter loop. + * We can switch based on the length to a different inner loop which is + * optimized for that particular case. + */ + { + U32 w; + int symbol=wksp->rankVal[0]; + int rankStart=0; + for (w=1; wrankVal[w]; + int const length = (1 << w) >> 1; + int uStart = rankStart; + BYTE const nbBits = tableLog + 1 - w; + int s; + int u; + switch (length) { + case 1: + for (s=0; ssymbols[symbol + s]; + D.nbBits = nbBits; + dt[uStart] = D; + uStart += 1; + } + break; + case 2: + for (s=0; ssymbols[symbol + s]; + D.nbBits = nbBits; + dt[uStart+0] = D; + dt[uStart+1] = D; + uStart += 2; + } + break; + case 4: + for (s=0; ssymbols[symbol + s], nbBits); + MEM_write64(dt + uStart, D4); + uStart += 4; + } + break; + case 8: + for (s=0; ssymbols[symbol + s], nbBits); + MEM_write64(dt + uStart, D4); + MEM_write64(dt + uStart + 4, D4); + uStart += 8; + } + break; + default: + for (s=0; ssymbols[symbol + s], nbBits); + for (u=0; u < length; u += 16) { + MEM_write64(dt + uStart + u + 0, D4); + MEM_write64(dt + uStart + u + 4, D4); + MEM_write64(dt + uStart + u + 8, D4); + MEM_write64(dt + uStart + u + 12, D4); + } + assert(u == length); + uStart += length; + } + break; + } + symbol += symbolCount; + rankStart += symbolCount * length; + } + } return iSize; } diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c index 95afcaa3..51ef977f 100644 --- a/lib/decompress/zstd_decompress_block.c +++ b/lib/decompress/zstd_decompress_block.c @@ -368,14 +368,17 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, const U32* baseValue, const U32* nbAdditionalBits, - unsigned tableLog, U32* wksp, size_t wkspSize) + unsigned tableLog, void* wksp, size_t wkspSize) { ZSTD_seqSymbol* const tableDecode = dt+1; - U16 symbolNext[MaxSeq+1]; - U32 const maxSV1 = maxSymbolValue + 1; U32 const tableSize = 1 << tableLog; + U16* symbolNext = (U16*)wksp; + BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1); + + assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE); + /* Sanity Checks */ assert(maxSymbolValue <= MaxSeq); assert(tableLog <= MaxFSELog); @@ -414,9 +417,6 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt, * all symbols have counts <= 8. We ensure we have 8 bytes at the end of * our buffer to handle the over-write. 
*/ - BYTE* spread = (BYTE*)wksp; - assert(wkspSize >= (1u << MaxFSELog) + sizeof(U64)); - (void)wkspSize; { U64 const add = 0x0101010101010101ull; size_t pos = 0; diff --git a/lib/decompress/zstd_decompress_block.h b/lib/decompress/zstd_decompress_block.h index 201d6a9f..03afdde4 100644 --- a/lib/decompress/zstd_decompress_block.h +++ b/lib/decompress/zstd_decompress_block.h @@ -48,12 +48,14 @@ size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, * this function must be called with valid parameters only * (dt is large enough, normalizedCounter distribution total is a power of 2, max is within range, etc.) * in which case it cannot fail. + * The workspace must be 4-byte aligned and at least ZSTD_BUILD_FSE_TABLE_WKSP_SIZE bytes. * Internal use only. */ +#define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64)) void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, const U32* baseValue, const U32* nbAdditionalBits, - unsigned tableLog, U32* wksp, size_t wkspSize); + unsigned tableLog, void* wksp, size_t wkspSize); #endif /* ZSTD_DEC_BLOCK_H */
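
Illustration (not part of the diff): the fast symbol spread used by the new FSE_buildDTable_wksp(). When highThreshold == tableSize-1 (no low-probability symbols), the patch replaces the per-symbol scatter with two phases: symbols are first laid down in run order with 8-byte stores into a scratch buffer, then scattered across the table with the usual (position + step) & tableMask walk. The sketch below shows the same idea in isolation; spread_symbols, counts and scratch are illustrative names, memcpy() stands in for MEM_write64(), TABLE_STEP mirrors FSE_TABLESTEP(), and counts are assumed non-negative and to sum to 1 << tableLog.

#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>

/* Mirrors FSE_TABLESTEP(); odd (hence coprime to the power-of-2 table size)
 * for the table sizes used here (tableLog >= 4). */
#define TABLE_STEP(size) (((size) >> 1) + ((size) >> 3) + 3)

/* Fill the symbol column of a decode table from normalized counts.
 * counts[0..maxSymbol] are assumed non-negative and to sum to 1 << tableLog
 * (i.e. the highThreshold == tableSize-1 fast path); scratch must provide
 * (1 << tableLog) + 8 bytes to absorb the 8-byte over-writes. */
static void spread_symbols(uint8_t* symbols, uint32_t tableLog,
                           const uint16_t* counts, uint32_t maxSymbol,
                           uint8_t* scratch)
{
    size_t const tableSize = (size_t)1 << tableLog;
    size_t const tableMask = tableSize - 1;
    size_t const step = TABLE_STEP(tableSize);

    /* Phase 1: lay the runs down in order, 8 identical bytes per store. */
    {   uint64_t sv = 0;                              /* symbol in every byte */
        uint64_t const add = 0x0101010101010101ULL;
        size_t pos = 0;
        uint32_t s;
        for (s = 0; s <= maxSymbol; ++s, sv += add) {
            int const n = counts[s];
            int i;
            for (i = 0; i < n; i += 8)
                memcpy(scratch + pos + i, &sv, 8);    /* may spill into slack */
            pos += (size_t)n;
        }
        assert(pos == tableSize);
    }

    /* Phase 2: scatter the runs across the table; each cell is hit exactly
     * once because step is coprime to tableSize. */
    {   size_t position = 0;
        size_t s;
        for (s = 0; s < tableSize; ++s) {
            symbols[position] = scratch[s];
            position = (position + step) & tableMask;
        }
        assert(position == 0);
    }
}

int main(void)
{
    uint16_t const counts[4] = { 16, 8, 6, 2 };       /* sums to 1 << 5 */
    uint8_t symbols[32], scratch[32 + 8];
    int tally[4] = { 0, 0, 0, 0 };
    int i;
    spread_symbols(symbols, 5, counts, 3, scratch);
    for (i = 0; i < 32; ++i) tally[symbols[i]]++;
    printf("%d %d %d %d\n", tally[0], tally[1], tally[2], tally[3]); /* 16 8 6 2 */
    return 0;
}

The 8-byte slack at the end of the scratch buffer is what lets every run be written in whole 8-byte chunks without bounds checks; it is the same sizeof(U64) slack reserved by FSE_BUILD_DTABLE_WKSP_SIZE and ZSTD_BUILD_FSE_TABLE_WKSP_SIZE in the patch.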
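
Illustration (not part of the diff): packing four HUF_DEltX1 entries into one 64-bit word. In the new fill loop of HUF_readDTableX1_wksp(), all entries of a given weight share the same nbBits, and every run of length >= 4 is a multiple of 4 entries, so HUF_DEltX1_set4() replicates one 2-byte {symbol, nbBits} cell across a U64 and the table is filled 4 (or 16) entries per store. A minimal standalone sketch of that trick follows; Elt, elt_set4 and fill_run are illustrative names, and memcpy() replaces MEM_write64() and the MEM_isLittleEndian() branch.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Two-byte decode cell, laid out like HUF_DEltX1 (assumed to have no padding). */
typedef struct { uint8_t byte; uint8_t nbBits; } Elt;

/* Replicate one cell into all four 16-bit lanes of a 64-bit word. */
static uint64_t elt_set4(uint8_t symbol, uint8_t nbBits)
{
    Elt e;
    uint16_t one;
    e.byte = symbol;
    e.nbBits = nbBits;
    memcpy(&one, &e, sizeof(one));          /* endianness handled by memcpy */
    return (uint64_t)one * 0x0001000100010001ULL;
}

/* Fill table[start .. start+length) with one repeated cell, 4 cells (8 bytes)
 * per store. length is a power of 2 >= 4 here, as for Huffman weights >= 3. */
static void fill_run(Elt* table, size_t start, size_t length,
                     uint8_t symbol, uint8_t nbBits)
{
    uint64_t const four = elt_set4(symbol, nbBits);
    size_t u;
    for (u = 0; u < length; u += 4)
        memcpy(table + start + u, &four, sizeof(four));
}

int main(void)
{
    Elt table[16];
    memset(table, 0, sizeof(table));
    fill_run(table, 0, 8, 'A', 3);
    printf("%c %u\n", table[7].byte, (unsigned)table[7].nbBits);  /* prints: A 3 */
    return 0;
}

Using memcpy() keeps the sketch endianness- and alignment-agnostic; the patch instead branches on MEM_isLittleEndian() so that the two bytes inside each 16-bit lane land in the same order as the in-memory layout of HUF_DEltX1.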