speed up literal header decoding

dev
Nick Terrell 2020-08-16 22:22:33 -07:00
parent 6004c1117f
commit ba1fd17a9f
8 changed files with 274 additions and 65 deletions

View File

@ -96,7 +96,6 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t
bitStream >>= 2 * repeats;
bitCount += 2 * repeats;
assert(bitCount < 30 && (bitStream & 3) != 3);
charnum += bitStream & 3;
bitCount += 2;
@ -186,6 +185,15 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t
size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
U32* nbSymbolsPtr, U32* tableLogPtr,
const void* src, size_t srcSize)
{
U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp));
}
size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats,
U32* nbSymbolsPtr, U32* tableLogPtr,
const void* src, size_t srcSize,
void* workSpace, size_t wkspSize)
{
U32 weightTotal;
const BYTE* ip = (const BYTE*) src;
@ -208,9 +216,8 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
huffWeight[n+1] = ip[n/2] & 15;
} } }
else { /* header compressed with FSE (normal case) */
FSE_DTable fseWorkspace[FSE_DTABLE_SIZE_U32(6)]; /* 6 is max possible tableLog for HUF header (maybe even 5, to be tested) */
if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
oSize = FSE_decompress_wksp(huffWeight, hwSize-1, ip+1, iSize, fseWorkspace, 6); /* max (hwSize-1) values decoded, as last one is implied */
oSize = FSE_decompress_wksp(huffWeight, hwSize-1, ip+1, iSize, 6, workSpace, wkspSize); /* max (hwSize-1) values decoded, as last one is implied */
if (FSE_isError(oSize)) return oSize;
}

View File

@ -311,7 +311,7 @@ unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsi
* Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
* FSE_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable.
*/
#define FSE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) )
#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) )
size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits);
@ -326,14 +326,21 @@ size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue);
*/
size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
#define FSE_BUILD_DTABLE_WKSP_SIZE(maxTableLog, maxSymbolValue) (sizeof(short) * (maxSymbolValue + 1) + (1 << maxTableLog) + 8)
#define FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ((FSE_BUILD_DTABLE_WKSP_SIZE(maxTableLog, maxSymbolValue) + sizeof(unsigned) - 1) / sizeof(unsigned))
FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
/**< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */
size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
/**< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */
size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
/**< build a fake FSE_DTable, designed to always generate the same symbolValue */
size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog);
/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DTABLE_SIZE_U32(maxLog)` */
#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue))
#define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned))
size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize);
/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */
typedef enum {
FSE_repeat_none, /**< Cannot use the previous table */

View File

@ -68,17 +68,24 @@ void FSE_freeDTable (FSE_DTable* dt)
free(dt);
}
size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) {
U32 wksp[FSE_BUILD_DTABLE_WKSP_SIZE_U32(FSE_TABLELOG_ABSOLUTE_MAX, FSE_MAX_SYMBOL_VALUE)];
return FSE_buildDTable_wksp(dt, normalizedCounter, maxSymbolValue, tableLog, wksp, sizeof(wksp));
}
size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
{
void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */
FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr);
U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1];
U16* symbolNext = (U16*)workSpace;
BYTE* spread = (BYTE*)(symbolNext + maxSymbolValue + 1);
U32 const maxSV1 = maxSymbolValue + 1;
U32 const tableSize = 1 << tableLog;
U32 highThreshold = tableSize-1;
/* Sanity Checks */
if (FSE_BUILD_DTABLE_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(maxSymbolValue_tooLarge);
if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
@ -100,7 +107,53 @@ size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned
}
/* Spread symbols */
{ U32 const tableMask = tableSize-1;
if (highThreshold == tableSize - 1) {
size_t const tableMask = tableSize-1;
size_t const step = FSE_TABLESTEP(tableSize);
/* First lay down the symbols in order.
* We use a uint64_t to lay down 8 bytes at a time. This reduces branch
* misses since small blocks generally have small table logs, so nearly
* all symbols have counts <= 8. We ensure we have 8 bytes at the end of
* our buffer to handle the over-write.
*/
{
U64 const add = 0x0101010101010101ull;
size_t pos = 0;
U64 sv = 0;
U32 s;
for (s=0; s<maxSV1; ++s, sv += add) {
int i;
int const n = normalizedCounter[s];
MEM_write64(spread + pos, sv);
for (i = 8; i < n; i += 8) {
MEM_write64(spread + pos + i, sv);
}
pos += n;
}
}
/* Now we spread those positions across the table.
* The benefit of doing it in two stages is that we avoid the the
* variable size inner loop, which caused lots of branch misses.
* Now we can run through all the positions without any branch misses.
* We unroll the loop twice, since that is what emperically worked best.
*/
{
size_t position = 0;
size_t s;
size_t const unroll = 2;
assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
for (s = 0; s < (size_t)tableSize; s += unroll) {
size_t u;
for (u = 0; u < unroll; ++u) {
size_t const uPosition = (position + (u * step)) & tableMask;
tableDecode[uPosition].symbol = spread[s + u];
}
position = (position + (unroll * step)) & tableMask;
}
assert(position == 0);
}
} else {
U32 const tableMask = tableSize-1;
U32 const step = FSE_TABLESTEP(tableSize);
U32 s, position = 0;
for (s=0; s<maxSV1; s++) {
@ -252,13 +305,14 @@ size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
}
size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog)
size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize)
{
const BYTE* const istart = (const BYTE*)cSrc;
const BYTE* ip = istart;
short counting[FSE_MAX_SYMBOL_VALUE+1];
unsigned tableLog;
unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
FSE_DTable* const dtable = (FSE_DTable*)workSpace;
/* normal FSE decoding mode */
size_t const NCountLength = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
@ -268,9 +322,13 @@ size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size
ip += NCountLength;
cSrcSize -= NCountLength;
CHECK_F( FSE_buildDTable (workSpace, counting, maxSymbolValue, tableLog) );
if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge);
workSpace = dtable + FSE_DTABLE_SIZE_U32(tableLog);
wkspSize -= FSE_DTABLE_SIZE(tableLog);
return FSE_decompress_usingDTable (dst, dstCapacity, ip, cSrcSize, workSpace); /* always return, even if it is an error code */
CHECK_F( FSE_buildDTable_wksp(dtable, counting, maxSymbolValue, tableLog, workSpace, wkspSize) );
return FSE_decompress_usingDTable (dst, dstCapacity, ip, cSrcSize, dtable); /* always return, even if it is an error code */
}
@ -278,8 +336,9 @@ typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
size_t FSE_decompress(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize)
{
DTable_max_t dt; /* Static analyzer seems unable to understand this table will be properly initialized later */
return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, dt, FSE_MAX_TABLELOG);
/* Static analyzer seems unable to understand this table will be properly initialized later */
U32 wksp[FSE_DECOMPRESS_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)];
return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, FSE_MAX_TABLELOG, wksp, sizeof(wksp));
}

View File

@ -111,6 +111,8 @@ HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity,
/* *** Dependencies *** */
#include "mem.h" /* U32 */
#define FSE_STATIC_LINKING_ONLY
#include "fse.h"
/* *** Constants *** */
@ -226,6 +228,17 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize,
U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
const void* src, size_t srcSize);
/*! HUF_readStats_wksp() :
* Same as HUF_readStats() but takes an external workspace which must be
* 4-byte aligned and its size must be >= HUF_READ_STATS_WORKSPACE_SIZE.
*/
#define HUF_READ_STATS_WORKSPACE_SIZE_U32 FSE_DECOMPRESS_WKSP_SIZE_U32(6, HUF_TABLELOG_MAX-1)
#define HUF_READ_STATS_WORKSPACE_SIZE (HUF_READ_STATS_WORKSPACE_SIZE_U32 * sizeof(unsigned))
size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize,
U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
const void* src, size_t srcSize,
void* workspace, size_t wkspSize);
/** HUF_readCTable() :
* Loading a CTable saved with HUF_writeCTable() */
size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned *hasZeroWeights);

View File

@ -630,7 +630,7 @@ size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t src
size_t const scratchBufferSize = wkspSize - (CTableSize * sizeof(FSE_CTable));
/* init conditions */
if (wkspSize < FSE_WKSP_SIZE_U32(tableLog, maxSymbolValue)) return ERROR(tableLog_tooLarge);
if (wkspSize < FSE_COMPRESS_WKSP_SIZE_U32(tableLog, maxSymbolValue)) return ERROR(tableLog_tooLarge);
if (srcSize <= 1) return 0; /* Not compressible */
if (!maxSymbolValue) maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
if (!tableLog) tableLog = FSE_DEFAULT_TABLELOG;
@ -674,7 +674,7 @@ typedef struct {
size_t FSE_compress2 (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog)
{
fseWkspMax_t scratchBuffer;
DEBUG_STATIC_ASSERT(sizeof(scratchBuffer) >= FSE_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)); /* compilation failures here means scratchBuffer is not large enough */
DEBUG_STATIC_ASSERT(sizeof(scratchBuffer) >= FSE_COMPRESS_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)); /* compilation failures here means scratchBuffer is not large enough */
if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
return FSE_compress_wksp(dst, dstCapacity, src, srcSize, maxSymbolValue, tableLog, &scratchBuffer, sizeof(scratchBuffer));
}

View File

@ -115,6 +115,70 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
/*-***************************/
typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decoding */
/**
* Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at
* a time.
*/
static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
U64 D4;
if (MEM_isLittleEndian()) {
D4 = symbol + (nbBits << 8);
} else {
D4 = (symbol << 8) + nbBits;
}
D4 *= 0x0001000100010001ULL;
return D4;
}
#if 0
// TODO: Remove this
/* BMI2 version that uses _pdep_u64() for weight 1 and 2 symbols.
* This doesn't provide much gains, so not worth the complexity.
* Leaving in for now but will remove before I commit.
*/
#include <immintrin.h>
static U64 HUF_DEltX1_pack4(BYTE const* symbols, BYTE nbBits) {
U64 D4;
if (MEM_isLittleEndian()) {
U64 const nbBits4 = nbBits * 0x0100010001000100ULL;
U64 const symbols4 = _pdep_u64(MEM_read32(symbols), 0x00FF00FF00FF00FFULL);
D4 = symbols4 | nbBits4;
} else {
U64 const nbBits4 = nbBits * 0x0001000100010001ULL;
U64 const symbols4 = _pdep_u64(MEM_read32(symbols), 0xFF00FF00FF00FF00ULL);
D4 = symbols4 | nbBits4;
}
return D4;
}
static U64 HUF_DEltX1_pack2(BYTE const* symbols, BYTE nbBits) {
U64 D4;
if (MEM_isLittleEndian()) {
U64 const nbBits4 = nbBits * 0x0100010001000100ULL;
U64 symbols4 = _pdep_u64(MEM_read16(symbols), 0x000000FF000000FFULL);
symbols4 = symbols4 * 0x00010001ULL;
D4 = symbols4 | nbBits4;
} else {
U64 const nbBits4 = nbBits * 0x0001000100010001ULL;
U64 symbols4 = _pdep_u64(MEM_read16(symbols), 0x0000FF000000FF00ULL);
symbols4 *= 0x00010001ULL;
D4 = symbols4 | nbBits4;
}
return D4;
}
#endif
typedef struct {
U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];
U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1];
U32 statsWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
BYTE symbols[HUF_SYMBOLVALUE_MAX + 1];
BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
} HUF_ReadDTableX1_Workspace;
// TODO: Template based on BMI2 (5% boost)
size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
{
U32 tableLog = 0;
@ -122,22 +186,15 @@ size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize
size_t iSize;
void* const dtPtr = DTable + 1;
HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr;
HUF_ReadDTableX1_Workspace* wksp = (HUF_ReadDTableX1_Workspace*)workSpace;
U32* rankVal;
BYTE* huffWeight;
size_t spaceUsed32 = 0;
rankVal = (U32 *)workSpace + spaceUsed32;
spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1;
huffWeight = (BYTE *)((U32 *)workSpace + spaceUsed32);
spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
DEBUG_STATIC_ASSERT(HUF_DECOMPRESS_WORKSPACE_SIZE >= sizeof(*wksp));
if (sizeof(*wksp) > wkspSize) return ERROR(tableLog_tooLarge);
DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
/* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
iSize = HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp));
if (HUF_isError(iSize)) return iSize;
/* Table header */
@ -148,39 +205,103 @@ size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize
memcpy(DTable, &dtd, sizeof(dtd));
}
/* Calculate starting value for each rank */
{ U32 n, nextRankStart = 0;
for (n=1; n<tableLog+1; n++) {
/* Compute symbols and rankStart given rankVal:
*
* rankVal already contains the number of values of each weight.
*
* symbols contains the symbols ordered by weight. First are the rankVal[0]
* weight 0 symbols, followed by the rankVal[1] weight 1 symbols, and so on.
* symbols[0] is filled (but unused) to avoid a branch.
*
* rankStart contains the offset where each rank belongs in the DTable.
* rankStart[0] is not filled because there are no entries in the table for
* weight 0.
*/
{
int n;
int nextRankStart = 0;
for (n=0; n<(int)tableLog+1; n++) {
U32 const current = nextRankStart;
nextRankStart += (rankVal[n] << (n-1));
rankVal[n] = current;
} }
nextRankStart += wksp->rankVal[n];
wksp->rankStart[n] = current;
}
// TODO: This loop is now the bottleneck: Can this be made faster?
for (n=0; n < (int)nbSymbols; ++n) {
size_t const w = wksp->huffWeight[n];
wksp->symbols[wksp->rankStart[w]++] = n;
}
}
/* fill DTable */
{ U32 n;
size_t const nEnd = nbSymbols;
for (n=0; n<nEnd; n++) {
size_t const w = huffWeight[n];
size_t const length = (1 << w) >> 1;
size_t const uStart = rankVal[w];
size_t const uEnd = uStart + length;
size_t u;
HUF_DEltX1 D;
D.byte = (BYTE)n;
D.nbBits = (BYTE)(tableLog + 1 - w);
rankVal[w] = (U32)uEnd;
if (length < 4) {
/* Use length in the loop bound so the compiler knows it is short. */
for (u = 0; u < length; ++u)
dt[uStart + u] = D;
} else {
/* Unroll the loop 4 times, we know it is a power of 2. */
for (u = uStart; u < uEnd; u += 4) {
dt[u + 0] = D;
dt[u + 1] = D;
dt[u + 2] = D;
dt[u + 3] = D;
} } } }
/* fill DTable
* We fill all entries of each weight in order.
* That way length is a constant for each iteration of the outter loop.
* We can switch based on the length to a different inner loop which is
* optimized for that particular case.
*/
{
U32 w;
int symbol=wksp->rankVal[0];
int rankStart=0;
for (w=1; w<tableLog+1; ++w) {
int const symbolCount = wksp->rankVal[w];
int const length = (1 << w) >> 1;
int uStart = rankStart;
BYTE const nbBits = tableLog + 1 - w;
int s;
int u;
switch (length) {
case 1:
for (s=0; s<symbolCount; ++s) {
HUF_DEltX1 D;
D.byte = wksp->symbols[symbol + s];
D.nbBits = nbBits;
dt[uStart] = D;
uStart += 1;
}
break;
case 2:
for (s=0; s<symbolCount; ++s) {
HUF_DEltX1 D;
D.byte = wksp->symbols[symbol + s];
D.nbBits = nbBits;
dt[uStart+0] = D;
dt[uStart+1] = D;
uStart += 2;
}
break;
case 4:
for (s=0; s<symbolCount; ++s) {
U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
MEM_write64(dt + uStart, D4);
uStart += 4;
}
break;
case 8:
for (s=0; s<symbolCount; ++s) {
U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
MEM_write64(dt + uStart, D4);
MEM_write64(dt + uStart + 4, D4);
uStart += 8;
}
break;
default:
for (s=0; s<symbolCount; ++s) {
U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
for (u=0; u < length; u += 16) {
MEM_write64(dt + uStart + u + 0, D4);
MEM_write64(dt + uStart + u + 4, D4);
MEM_write64(dt + uStart + u + 8, D4);
MEM_write64(dt + uStart + u + 12, D4);
}
assert(u == length);
uStart += length;
}
break;
}
symbol += symbolCount;
rankStart += symbolCount * length;
}
}
return iSize;
}

View File

@ -368,14 +368,17 @@ void
ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
const short* normalizedCounter, unsigned maxSymbolValue,
const U32* baseValue, const U32* nbAdditionalBits,
unsigned tableLog, U32* wksp, size_t wkspSize)
unsigned tableLog, void* wksp, size_t wkspSize)
{
ZSTD_seqSymbol* const tableDecode = dt+1;
U16 symbolNext[MaxSeq+1];
U32 const maxSV1 = maxSymbolValue + 1;
U32 const tableSize = 1 << tableLog;
U16* symbolNext = (U16*)wksp;
BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1);
assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE);
/* Sanity Checks */
assert(maxSymbolValue <= MaxSeq);
assert(tableLog <= MaxFSELog);
@ -414,9 +417,6 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
* all symbols have counts <= 8. We ensure we have 8 bytes at the end of
* our buffer to handle the over-write.
*/
BYTE* spread = (BYTE*)wksp;
assert(wkspSize >= (1u << MaxFSELog) + sizeof(U64));
(void)wkspSize;
{
U64 const add = 0x0101010101010101ull;
size_t pos = 0;

View File

@ -48,12 +48,14 @@ size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
* this function must be called with valid parameters only
* (dt is large enough, normalizedCounter distribution total is a power of 2, max is within range, etc.)
* in which case it cannot fail.
* The workspace must be 4-byte aligned and at least ZSTD_BUILD_FSE_TABLE_WKSP_SIZE bytes.
* Internal use only.
*/
#define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64))
void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
const short* normalizedCounter, unsigned maxSymbolValue,
const U32* baseValue, const U32* nbAdditionalBits,
unsigned tableLog, U32* wksp, size_t wkspSize);
unsigned tableLog, void* wksp, size_t wkspSize);
#endif /* ZSTD_DEC_BLOCK_H */