speed up literal header decoding
parent
6004c1117f
commit
ba1fd17a9f
|
@ -96,7 +96,6 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t
|
|||
bitStream >>= 2 * repeats;
|
||||
bitCount += 2 * repeats;
|
||||
|
||||
assert(bitCount < 30 && (bitStream & 3) != 3);
|
||||
charnum += bitStream & 3;
|
||||
bitCount += 2;
|
||||
|
||||
|
@ -186,6 +185,15 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t
|
|||
size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
|
||||
U32* nbSymbolsPtr, U32* tableLogPtr,
|
||||
const void* src, size_t srcSize)
|
||||
{
|
||||
U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
|
||||
return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp));
|
||||
}
|
||||
|
||||
size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats,
|
||||
U32* nbSymbolsPtr, U32* tableLogPtr,
|
||||
const void* src, size_t srcSize,
|
||||
void* workSpace, size_t wkspSize)
|
||||
{
|
||||
U32 weightTotal;
|
||||
const BYTE* ip = (const BYTE*) src;
|
||||
|
@ -208,9 +216,8 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
|
|||
huffWeight[n+1] = ip[n/2] & 15;
|
||||
} } }
|
||||
else { /* header compressed with FSE (normal case) */
|
||||
FSE_DTable fseWorkspace[FSE_DTABLE_SIZE_U32(6)]; /* 6 is max possible tableLog for HUF header (maybe even 5, to be tested) */
|
||||
if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
|
||||
oSize = FSE_decompress_wksp(huffWeight, hwSize-1, ip+1, iSize, fseWorkspace, 6); /* max (hwSize-1) values decoded, as last one is implied */
|
||||
oSize = FSE_decompress_wksp(huffWeight, hwSize-1, ip+1, iSize, 6, workSpace, wkspSize); /* max (hwSize-1) values decoded, as last one is implied */
|
||||
if (FSE_isError(oSize)) return oSize;
|
||||
}
|
||||
|
||||
|
|
|
@ -311,7 +311,7 @@ unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsi
|
|||
* Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
|
||||
* FSE_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable.
|
||||
*/
|
||||
#define FSE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) )
|
||||
#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) )
|
||||
size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
|
||||
|
||||
size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits);
|
||||
|
@ -326,14 +326,21 @@ size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue);
|
|||
*/
|
||||
size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
|
||||
|
||||
#define FSE_BUILD_DTABLE_WKSP_SIZE(maxTableLog, maxSymbolValue) (sizeof(short) * (maxSymbolValue + 1) + (1 << maxTableLog) + 8)
|
||||
#define FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ((FSE_BUILD_DTABLE_WKSP_SIZE(maxTableLog, maxSymbolValue) + sizeof(unsigned) - 1) / sizeof(unsigned))
|
||||
FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
|
||||
/**< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */
|
||||
|
||||
size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
|
||||
/**< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */
|
||||
|
||||
size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
|
||||
/**< build a fake FSE_DTable, designed to always generate the same symbolValue */
|
||||
|
||||
size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog);
|
||||
/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DTABLE_SIZE_U32(maxLog)` */
|
||||
#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue))
|
||||
#define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned))
|
||||
size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize);
|
||||
/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */
|
||||
|
||||
typedef enum {
|
||||
FSE_repeat_none, /**< Cannot use the previous table */
|
||||
|
|
|
@ -68,17 +68,24 @@ void FSE_freeDTable (FSE_DTable* dt)
|
|||
free(dt);
|
||||
}
|
||||
|
||||
size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
|
||||
size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) {
|
||||
U32 wksp[FSE_BUILD_DTABLE_WKSP_SIZE_U32(FSE_TABLELOG_ABSOLUTE_MAX, FSE_MAX_SYMBOL_VALUE)];
|
||||
return FSE_buildDTable_wksp(dt, normalizedCounter, maxSymbolValue, tableLog, wksp, sizeof(wksp));
|
||||
}
|
||||
|
||||
size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
|
||||
{
|
||||
void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */
|
||||
FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr);
|
||||
U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1];
|
||||
U16* symbolNext = (U16*)workSpace;
|
||||
BYTE* spread = (BYTE*)(symbolNext + maxSymbolValue + 1);
|
||||
|
||||
U32 const maxSV1 = maxSymbolValue + 1;
|
||||
U32 const tableSize = 1 << tableLog;
|
||||
U32 highThreshold = tableSize-1;
|
||||
|
||||
/* Sanity Checks */
|
||||
if (FSE_BUILD_DTABLE_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(maxSymbolValue_tooLarge);
|
||||
if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
|
||||
if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
|
||||
|
||||
|
@ -100,7 +107,53 @@ size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned
|
|||
}
|
||||
|
||||
/* Spread symbols */
|
||||
{ U32 const tableMask = tableSize-1;
|
||||
if (highThreshold == tableSize - 1) {
|
||||
size_t const tableMask = tableSize-1;
|
||||
size_t const step = FSE_TABLESTEP(tableSize);
|
||||
/* First lay down the symbols in order.
|
||||
* We use a uint64_t to lay down 8 bytes at a time. This reduces branch
|
||||
* misses since small blocks generally have small table logs, so nearly
|
||||
* all symbols have counts <= 8. We ensure we have 8 bytes at the end of
|
||||
* our buffer to handle the over-write.
|
||||
*/
|
||||
{
|
||||
U64 const add = 0x0101010101010101ull;
|
||||
size_t pos = 0;
|
||||
U64 sv = 0;
|
||||
U32 s;
|
||||
for (s=0; s<maxSV1; ++s, sv += add) {
|
||||
int i;
|
||||
int const n = normalizedCounter[s];
|
||||
MEM_write64(spread + pos, sv);
|
||||
for (i = 8; i < n; i += 8) {
|
||||
MEM_write64(spread + pos + i, sv);
|
||||
}
|
||||
pos += n;
|
||||
}
|
||||
}
|
||||
/* Now we spread those positions across the table.
|
||||
* The benefit of doing it in two stages is that we avoid the the
|
||||
* variable size inner loop, which caused lots of branch misses.
|
||||
* Now we can run through all the positions without any branch misses.
|
||||
* We unroll the loop twice, since that is what emperically worked best.
|
||||
*/
|
||||
{
|
||||
size_t position = 0;
|
||||
size_t s;
|
||||
size_t const unroll = 2;
|
||||
assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
|
||||
for (s = 0; s < (size_t)tableSize; s += unroll) {
|
||||
size_t u;
|
||||
for (u = 0; u < unroll; ++u) {
|
||||
size_t const uPosition = (position + (u * step)) & tableMask;
|
||||
tableDecode[uPosition].symbol = spread[s + u];
|
||||
}
|
||||
position = (position + (unroll * step)) & tableMask;
|
||||
}
|
||||
assert(position == 0);
|
||||
}
|
||||
} else {
|
||||
U32 const tableMask = tableSize-1;
|
||||
U32 const step = FSE_TABLESTEP(tableSize);
|
||||
U32 s, position = 0;
|
||||
for (s=0; s<maxSV1; s++) {
|
||||
|
@ -252,13 +305,14 @@ size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
|
|||
}
|
||||
|
||||
|
||||
size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog)
|
||||
size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize)
|
||||
{
|
||||
const BYTE* const istart = (const BYTE*)cSrc;
|
||||
const BYTE* ip = istart;
|
||||
short counting[FSE_MAX_SYMBOL_VALUE+1];
|
||||
unsigned tableLog;
|
||||
unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
|
||||
FSE_DTable* const dtable = (FSE_DTable*)workSpace;
|
||||
|
||||
/* normal FSE decoding mode */
|
||||
size_t const NCountLength = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
|
||||
|
@ -268,9 +322,13 @@ size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size
|
|||
ip += NCountLength;
|
||||
cSrcSize -= NCountLength;
|
||||
|
||||
CHECK_F( FSE_buildDTable (workSpace, counting, maxSymbolValue, tableLog) );
|
||||
if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge);
|
||||
workSpace = dtable + FSE_DTABLE_SIZE_U32(tableLog);
|
||||
wkspSize -= FSE_DTABLE_SIZE(tableLog);
|
||||
|
||||
return FSE_decompress_usingDTable (dst, dstCapacity, ip, cSrcSize, workSpace); /* always return, even if it is an error code */
|
||||
CHECK_F( FSE_buildDTable_wksp(dtable, counting, maxSymbolValue, tableLog, workSpace, wkspSize) );
|
||||
|
||||
return FSE_decompress_usingDTable (dst, dstCapacity, ip, cSrcSize, dtable); /* always return, even if it is an error code */
|
||||
}
|
||||
|
||||
|
||||
|
@ -278,8 +336,9 @@ typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
|
|||
|
||||
size_t FSE_decompress(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize)
|
||||
{
|
||||
DTable_max_t dt; /* Static analyzer seems unable to understand this table will be properly initialized later */
|
||||
return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, dt, FSE_MAX_TABLELOG);
|
||||
/* Static analyzer seems unable to understand this table will be properly initialized later */
|
||||
U32 wksp[FSE_DECOMPRESS_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)];
|
||||
return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, FSE_MAX_TABLELOG, wksp, sizeof(wksp));
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -111,6 +111,8 @@ HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity,
|
|||
|
||||
/* *** Dependencies *** */
|
||||
#include "mem.h" /* U32 */
|
||||
#define FSE_STATIC_LINKING_ONLY
|
||||
#include "fse.h"
|
||||
|
||||
|
||||
/* *** Constants *** */
|
||||
|
@ -226,6 +228,17 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize,
|
|||
U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
|
||||
const void* src, size_t srcSize);
|
||||
|
||||
/*! HUF_readStats_wksp() :
|
||||
* Same as HUF_readStats() but takes an external workspace which must be
|
||||
* 4-byte aligned and its size must be >= HUF_READ_STATS_WORKSPACE_SIZE.
|
||||
*/
|
||||
#define HUF_READ_STATS_WORKSPACE_SIZE_U32 FSE_DECOMPRESS_WKSP_SIZE_U32(6, HUF_TABLELOG_MAX-1)
|
||||
#define HUF_READ_STATS_WORKSPACE_SIZE (HUF_READ_STATS_WORKSPACE_SIZE_U32 * sizeof(unsigned))
|
||||
size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize,
|
||||
U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
|
||||
const void* src, size_t srcSize,
|
||||
void* workspace, size_t wkspSize);
|
||||
|
||||
/** HUF_readCTable() :
|
||||
* Loading a CTable saved with HUF_writeCTable() */
|
||||
size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned *hasZeroWeights);
|
||||
|
|
|
@ -630,7 +630,7 @@ size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t src
|
|||
size_t const scratchBufferSize = wkspSize - (CTableSize * sizeof(FSE_CTable));
|
||||
|
||||
/* init conditions */
|
||||
if (wkspSize < FSE_WKSP_SIZE_U32(tableLog, maxSymbolValue)) return ERROR(tableLog_tooLarge);
|
||||
if (wkspSize < FSE_COMPRESS_WKSP_SIZE_U32(tableLog, maxSymbolValue)) return ERROR(tableLog_tooLarge);
|
||||
if (srcSize <= 1) return 0; /* Not compressible */
|
||||
if (!maxSymbolValue) maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
|
||||
if (!tableLog) tableLog = FSE_DEFAULT_TABLELOG;
|
||||
|
@ -674,7 +674,7 @@ typedef struct {
|
|||
size_t FSE_compress2 (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog)
|
||||
{
|
||||
fseWkspMax_t scratchBuffer;
|
||||
DEBUG_STATIC_ASSERT(sizeof(scratchBuffer) >= FSE_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)); /* compilation failures here means scratchBuffer is not large enough */
|
||||
DEBUG_STATIC_ASSERT(sizeof(scratchBuffer) >= FSE_COMPRESS_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)); /* compilation failures here means scratchBuffer is not large enough */
|
||||
if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
|
||||
return FSE_compress_wksp(dst, dstCapacity, src, srcSize, maxSymbolValue, tableLog, &scratchBuffer, sizeof(scratchBuffer));
|
||||
}
|
||||
|
|
|
@ -115,6 +115,70 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
|
|||
/*-***************************/
|
||||
typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decoding */
|
||||
|
||||
/**
|
||||
* Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at
|
||||
* a time.
|
||||
*/
|
||||
static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
|
||||
U64 D4;
|
||||
if (MEM_isLittleEndian()) {
|
||||
D4 = symbol + (nbBits << 8);
|
||||
} else {
|
||||
D4 = (symbol << 8) + nbBits;
|
||||
}
|
||||
D4 *= 0x0001000100010001ULL;
|
||||
return D4;
|
||||
}
|
||||
|
||||
#if 0
|
||||
// TODO: Remove this
|
||||
/* BMI2 version that uses _pdep_u64() for weight 1 and 2 symbols.
|
||||
* This doesn't provide much gains, so not worth the complexity.
|
||||
* Leaving in for now but will remove before I commit.
|
||||
*/
|
||||
#include <immintrin.h>
|
||||
|
||||
static U64 HUF_DEltX1_pack4(BYTE const* symbols, BYTE nbBits) {
|
||||
U64 D4;
|
||||
if (MEM_isLittleEndian()) {
|
||||
U64 const nbBits4 = nbBits * 0x0100010001000100ULL;
|
||||
U64 const symbols4 = _pdep_u64(MEM_read32(symbols), 0x00FF00FF00FF00FFULL);
|
||||
D4 = symbols4 | nbBits4;
|
||||
} else {
|
||||
U64 const nbBits4 = nbBits * 0x0001000100010001ULL;
|
||||
U64 const symbols4 = _pdep_u64(MEM_read32(symbols), 0xFF00FF00FF00FF00ULL);
|
||||
D4 = symbols4 | nbBits4;
|
||||
}
|
||||
return D4;
|
||||
}
|
||||
|
||||
static U64 HUF_DEltX1_pack2(BYTE const* symbols, BYTE nbBits) {
|
||||
U64 D4;
|
||||
if (MEM_isLittleEndian()) {
|
||||
U64 const nbBits4 = nbBits * 0x0100010001000100ULL;
|
||||
U64 symbols4 = _pdep_u64(MEM_read16(symbols), 0x000000FF000000FFULL);
|
||||
symbols4 = symbols4 * 0x00010001ULL;
|
||||
D4 = symbols4 | nbBits4;
|
||||
} else {
|
||||
U64 const nbBits4 = nbBits * 0x0001000100010001ULL;
|
||||
U64 symbols4 = _pdep_u64(MEM_read16(symbols), 0x0000FF000000FF00ULL);
|
||||
symbols4 *= 0x00010001ULL;
|
||||
D4 = symbols4 | nbBits4;
|
||||
}
|
||||
return D4;
|
||||
}
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];
|
||||
U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1];
|
||||
U32 statsWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
|
||||
BYTE symbols[HUF_SYMBOLVALUE_MAX + 1];
|
||||
BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
|
||||
} HUF_ReadDTableX1_Workspace;
|
||||
|
||||
|
||||
// TODO: Template based on BMI2 (5% boost)
|
||||
size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
|
||||
{
|
||||
U32 tableLog = 0;
|
||||
|
@ -122,22 +186,15 @@ size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize
|
|||
size_t iSize;
|
||||
void* const dtPtr = DTable + 1;
|
||||
HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr;
|
||||
HUF_ReadDTableX1_Workspace* wksp = (HUF_ReadDTableX1_Workspace*)workSpace;
|
||||
|
||||
U32* rankVal;
|
||||
BYTE* huffWeight;
|
||||
size_t spaceUsed32 = 0;
|
||||
|
||||
rankVal = (U32 *)workSpace + spaceUsed32;
|
||||
spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1;
|
||||
huffWeight = (BYTE *)((U32 *)workSpace + spaceUsed32);
|
||||
spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
|
||||
|
||||
if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
|
||||
DEBUG_STATIC_ASSERT(HUF_DECOMPRESS_WORKSPACE_SIZE >= sizeof(*wksp));
|
||||
if (sizeof(*wksp) > wkspSize) return ERROR(tableLog_tooLarge);
|
||||
|
||||
DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
|
||||
/* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
|
||||
|
||||
iSize = HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
|
||||
iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp));
|
||||
if (HUF_isError(iSize)) return iSize;
|
||||
|
||||
/* Table header */
|
||||
|
@ -148,39 +205,103 @@ size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize
|
|||
memcpy(DTable, &dtd, sizeof(dtd));
|
||||
}
|
||||
|
||||
/* Calculate starting value for each rank */
|
||||
{ U32 n, nextRankStart = 0;
|
||||
for (n=1; n<tableLog+1; n++) {
|
||||
/* Compute symbols and rankStart given rankVal:
|
||||
*
|
||||
* rankVal already contains the number of values of each weight.
|
||||
*
|
||||
* symbols contains the symbols ordered by weight. First are the rankVal[0]
|
||||
* weight 0 symbols, followed by the rankVal[1] weight 1 symbols, and so on.
|
||||
* symbols[0] is filled (but unused) to avoid a branch.
|
||||
*
|
||||
* rankStart contains the offset where each rank belongs in the DTable.
|
||||
* rankStart[0] is not filled because there are no entries in the table for
|
||||
* weight 0.
|
||||
*/
|
||||
{
|
||||
int n;
|
||||
int nextRankStart = 0;
|
||||
for (n=0; n<(int)tableLog+1; n++) {
|
||||
U32 const current = nextRankStart;
|
||||
nextRankStart += (rankVal[n] << (n-1));
|
||||
rankVal[n] = current;
|
||||
} }
|
||||
nextRankStart += wksp->rankVal[n];
|
||||
wksp->rankStart[n] = current;
|
||||
}
|
||||
// TODO: This loop is now the bottleneck: Can this be made faster?
|
||||
for (n=0; n < (int)nbSymbols; ++n) {
|
||||
size_t const w = wksp->huffWeight[n];
|
||||
wksp->symbols[wksp->rankStart[w]++] = n;
|
||||
}
|
||||
}
|
||||
|
||||
/* fill DTable */
|
||||
{ U32 n;
|
||||
size_t const nEnd = nbSymbols;
|
||||
for (n=0; n<nEnd; n++) {
|
||||
size_t const w = huffWeight[n];
|
||||
size_t const length = (1 << w) >> 1;
|
||||
size_t const uStart = rankVal[w];
|
||||
size_t const uEnd = uStart + length;
|
||||
size_t u;
|
||||
HUF_DEltX1 D;
|
||||
D.byte = (BYTE)n;
|
||||
D.nbBits = (BYTE)(tableLog + 1 - w);
|
||||
rankVal[w] = (U32)uEnd;
|
||||
if (length < 4) {
|
||||
/* Use length in the loop bound so the compiler knows it is short. */
|
||||
for (u = 0; u < length; ++u)
|
||||
dt[uStart + u] = D;
|
||||
} else {
|
||||
/* Unroll the loop 4 times, we know it is a power of 2. */
|
||||
for (u = uStart; u < uEnd; u += 4) {
|
||||
dt[u + 0] = D;
|
||||
dt[u + 1] = D;
|
||||
dt[u + 2] = D;
|
||||
dt[u + 3] = D;
|
||||
} } } }
|
||||
/* fill DTable
|
||||
* We fill all entries of each weight in order.
|
||||
* That way length is a constant for each iteration of the outter loop.
|
||||
* We can switch based on the length to a different inner loop which is
|
||||
* optimized for that particular case.
|
||||
*/
|
||||
{
|
||||
U32 w;
|
||||
int symbol=wksp->rankVal[0];
|
||||
int rankStart=0;
|
||||
for (w=1; w<tableLog+1; ++w) {
|
||||
int const symbolCount = wksp->rankVal[w];
|
||||
int const length = (1 << w) >> 1;
|
||||
int uStart = rankStart;
|
||||
BYTE const nbBits = tableLog + 1 - w;
|
||||
int s;
|
||||
int u;
|
||||
switch (length) {
|
||||
case 1:
|
||||
for (s=0; s<symbolCount; ++s) {
|
||||
HUF_DEltX1 D;
|
||||
D.byte = wksp->symbols[symbol + s];
|
||||
D.nbBits = nbBits;
|
||||
dt[uStart] = D;
|
||||
uStart += 1;
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
for (s=0; s<symbolCount; ++s) {
|
||||
HUF_DEltX1 D;
|
||||
D.byte = wksp->symbols[symbol + s];
|
||||
D.nbBits = nbBits;
|
||||
dt[uStart+0] = D;
|
||||
dt[uStart+1] = D;
|
||||
uStart += 2;
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
for (s=0; s<symbolCount; ++s) {
|
||||
U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
|
||||
MEM_write64(dt + uStart, D4);
|
||||
uStart += 4;
|
||||
}
|
||||
break;
|
||||
case 8:
|
||||
for (s=0; s<symbolCount; ++s) {
|
||||
U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
|
||||
MEM_write64(dt + uStart, D4);
|
||||
MEM_write64(dt + uStart + 4, D4);
|
||||
uStart += 8;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
for (s=0; s<symbolCount; ++s) {
|
||||
U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits);
|
||||
for (u=0; u < length; u += 16) {
|
||||
MEM_write64(dt + uStart + u + 0, D4);
|
||||
MEM_write64(dt + uStart + u + 4, D4);
|
||||
MEM_write64(dt + uStart + u + 8, D4);
|
||||
MEM_write64(dt + uStart + u + 12, D4);
|
||||
}
|
||||
assert(u == length);
|
||||
uStart += length;
|
||||
}
|
||||
break;
|
||||
}
|
||||
symbol += symbolCount;
|
||||
rankStart += symbolCount * length;
|
||||
}
|
||||
}
|
||||
return iSize;
|
||||
}
|
||||
|
||||
|
|
|
@ -368,14 +368,17 @@ void
|
|||
ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
|
||||
const short* normalizedCounter, unsigned maxSymbolValue,
|
||||
const U32* baseValue, const U32* nbAdditionalBits,
|
||||
unsigned tableLog, U32* wksp, size_t wkspSize)
|
||||
unsigned tableLog, void* wksp, size_t wkspSize)
|
||||
{
|
||||
ZSTD_seqSymbol* const tableDecode = dt+1;
|
||||
U16 symbolNext[MaxSeq+1];
|
||||
|
||||
U32 const maxSV1 = maxSymbolValue + 1;
|
||||
U32 const tableSize = 1 << tableLog;
|
||||
|
||||
U16* symbolNext = (U16*)wksp;
|
||||
BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1);
|
||||
|
||||
assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE);
|
||||
|
||||
/* Sanity Checks */
|
||||
assert(maxSymbolValue <= MaxSeq);
|
||||
assert(tableLog <= MaxFSELog);
|
||||
|
@ -414,9 +417,6 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
|
|||
* all symbols have counts <= 8. We ensure we have 8 bytes at the end of
|
||||
* our buffer to handle the over-write.
|
||||
*/
|
||||
BYTE* spread = (BYTE*)wksp;
|
||||
assert(wkspSize >= (1u << MaxFSELog) + sizeof(U64));
|
||||
(void)wkspSize;
|
||||
{
|
||||
U64 const add = 0x0101010101010101ull;
|
||||
size_t pos = 0;
|
||||
|
|
|
@ -48,12 +48,14 @@ size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
|
|||
* this function must be called with valid parameters only
|
||||
* (dt is large enough, normalizedCounter distribution total is a power of 2, max is within range, etc.)
|
||||
* in which case it cannot fail.
|
||||
* The workspace must be 4-byte aligned and at least ZSTD_BUILD_FSE_TABLE_WKSP_SIZE bytes.
|
||||
* Internal use only.
|
||||
*/
|
||||
#define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64))
|
||||
void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
|
||||
const short* normalizedCounter, unsigned maxSymbolValue,
|
||||
const U32* baseValue, const U32* nbAdditionalBits,
|
||||
unsigned tableLog, U32* wksp, size_t wkspSize);
|
||||
unsigned tableLog, void* wksp, size_t wkspSize);
|
||||
|
||||
|
||||
#endif /* ZSTD_DEC_BLOCK_H */
|
||||
|
|
Loading…
Reference in New Issue