[HUF] Improve Huffman encoding speed
Improve Huffman encoding speed by 20% for gcc and 10% for clang.

| Compiler | Benchmark         | Config  | Dataset | Ratio | Speed MB/s (dev) | Speed MB/s (huf-cspeed) | Speed MB/s (huf-cspeed - dev) |
|----------|-------------------|---------|---------|-------|------------------|-------------------------|-------------------------------|
| gcc      | compress          | level_1 | enwik7  | 2.43  | 253.70           | 258.72                  | 2.0%                          |
| gcc      | compress          | level_1 | silesia | 2.88  | 341.90           | 348.15                  | 1.8%                          |
| gcc      | compress_literals | level_1 | enwik7  | 1.49  | 761.83           | 912.76                  | 19.8%                         |
| gcc      | compress_literals | level_1 | silesia | 1.28  | 754.83           | 902.37                  | 19.5%                         |
| gcc      | compress_literals | level_7 | enwik7  | 1.29  | 502.81           | 552.79                  | 9.9%                          |
| gcc      | compress_literals | level_7 | silesia | 1.11  | 675.97           | 776.44                  | 14.9%                         |
| clang    | compress          | level_1 | enwik7  | 2.43  | 277.54           | 280.98                  | 1.2%                          |
| clang    | compress          | level_1 | silesia | 2.88  | 369.98           | 375.46                  | 1.5%                          |
| clang    | compress_literals | level_1 | enwik7  | 1.49  | 828.83           | 918.41                  | 10.8%                         |
| clang    | compress_literals | level_1 | silesia | 1.28  | 815.81           | 905.41                  | 11.0%                         |
| clang    | compress_literals | level_7 | enwik7  | 1.29  | 533.13           | 553.30                  | 3.8%                          |
| clang    | compress_literals | level_7 | silesia | 1.11  | 714.52           | 775.38                  | 8.5%                          |
This commit is contained in:
parent
b3e372c171
commit
46f2710562
@ -89,9 +89,9 @@ HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity,
|
||||
|
||||
/** HUF_compress4X_wksp() :
|
||||
* Same as HUF_compress2(), but uses externally allocated `workSpace`.
|
||||
* `workspace` must have minimum alignment of 4, and be at least as large as HUF_WORKSPACE_SIZE */
|
||||
#define HUF_WORKSPACE_SIZE ((6 << 10) + 256)
|
||||
#define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32))
|
||||
* `workspace` must be at least as large as HUF_WORKSPACE_SIZE */
|
||||
#define HUF_WORKSPACE_SIZE ((8 << 10) + 256)
|
||||
#define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64))
|
||||
HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity,
|
||||
const void* src, size_t srcSize,
|
||||
unsigned maxSymbolValue, unsigned tableLog,
|
||||
@ -136,15 +136,11 @@ HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity,
|
||||
|
||||
/* static allocation of HUF's Compression Table */
|
||||
/* this is a private definition, just exposed for allocation and strict aliasing purpose. never EVER access its members directly */
|
||||
struct HUF_CElt_s {
|
||||
U16 val;
|
||||
BYTE nbBits;
|
||||
}; /* typedef'd to HUF_CElt */
|
||||
typedef struct HUF_CElt_s HUF_CElt; /* consider it an incomplete type */
|
||||
#define HUF_CTABLE_SIZE_U32(maxSymbolValue) ((maxSymbolValue)+1) /* Use tables of U32, for proper alignment */
|
||||
#define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_U32(maxSymbolValue) * sizeof(U32))
|
||||
typedef size_t HUF_CElt; /* consider it an incomplete type */
|
||||
#define HUF_CTABLE_SIZE_ST(maxSymbolValue) ((maxSymbolValue)+2) /* Use tables of size_t, for proper alignment */
|
||||
#define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_ST(maxSymbolValue) * sizeof(size_t))
|
||||
#define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \
|
||||
HUF_CElt name[HUF_CTABLE_SIZE_U32(maxSymbolValue)] /* no final ; */
|
||||
HUF_CElt name[HUF_CTABLE_SIZE_ST(maxSymbolValue)] /* no final ; */
|
||||
|
||||
/* static allocation of HUF's DTable */
|
||||
typedef U32 HUF_DTable;
|
||||
@ -250,11 +246,10 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize,
|
||||
* Loading a CTable saved with HUF_writeCTable() */
|
||||
size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned *hasZeroWeights);
|
||||
|
||||
/** HUF_getNbBits() :
|
||||
/** HUF_getNbBitsFromCTable() :
|
||||
* Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX
|
||||
* Note 1 : is not inlined, as HUF_CElt definition is private
|
||||
* Note 2 : const void* used, so that it can provide a statically allocated table as argument (which uses type U32) */
|
||||
U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue);
|
||||
* Note 1 : is not inlined, as HUF_CElt definition is private */
|
||||
U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue);
|
||||
|
||||
/*
|
||||
* HUF_decompress() does the following:
|
||||
@ -306,7 +301,7 @@ size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* c
|
||||
/* ====================== */
|
||||
|
||||
size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
|
||||
size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
|
||||
size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */
|
||||
size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
|
||||
/** HUF_compress1X_repeat() :
|
||||
* Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
|
||||
|
@ -53,6 +53,28 @@ unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxS
|
||||
/* *******************************************************
|
||||
* HUF : Huffman block compression
|
||||
*********************************************************/
|
||||
#define HUF_WORKSPACE_MAX_ALIGNMENT 8

/* HUF_alignUpWorkspace() :
 * Rounds `workspace` up to the next multiple of `align` and reduces
 * `*workspaceSizePtr` by the number of padding bytes skipped.
 * `align` must be a power of 2 no larger than HUF_WORKSPACE_MAX_ALIGNMENT.
 * @return : the aligned pointer on success.
 *           NULL, with `*workspaceSizePtr` set to 0, when `workspace` is NULL
 *           or when the workspace is smaller than the required padding.
 *           Callers are expected to detect failure through their own
 *           "workspace too small" size check.
 */
static void* HUF_alignUpWorkspace(void* workspace, size_t* workspaceSizePtr, size_t align)
{
    size_t const mask = align - 1;
    size_t const rem = (size_t)workspace & mask;
    size_t const add = (align - rem) & mask;   /* padding needed, in [0, align) */
    assert((align & (align - 1)) == 0); /* pow 2 */
    assert(align <= HUF_WORKSPACE_MAX_ALIGNMENT);

    /* Validate before doing any pointer arithmetic : advancing a NULL pointer,
     * or advancing past the end of a too-small buffer, is undefined behavior. */
    if (workspace == NULL || *workspaceSizePtr < add) {
        *workspaceSizePtr = 0;
        return NULL;
    }

    {   unsigned char* const aligned = (unsigned char*)workspace + add;
        assert(add < align);
        assert(((size_t)aligned & mask) == 0);
        *workspaceSizePtr -= add;
        return aligned;
    }
}
|
||||
|
||||
|
||||
/* HUF_compressWeights() :
|
||||
* Same as FSE_compress(), but dedicated to huff0's weights compression.
|
||||
* The use case needs much less stack memory.
|
||||
@ -75,7 +97,7 @@ static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightT
|
||||
|
||||
unsigned maxSymbolValue = HUF_TABLELOG_MAX;
|
||||
U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER;
|
||||
HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)workspace;
|
||||
HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, sizeof(U32));
|
||||
|
||||
if (workspaceSize < sizeof(HUF_CompressWeightsWksp)) return ERROR(GENERIC);
|
||||
|
||||
@ -106,6 +128,40 @@ static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightT
|
||||
return (size_t)(op-ostart);
|
||||
}
|
||||
|
||||
/* HUF_CElt accessors.
 * A HUF_CElt packs a (nbBits, value) pair into a single word :
 *   - the low 8 bits hold nbBits,
 *   - the top nbBits bits hold the value.
 * An element is built by calling HUF_setNbBits() first, then HUF_setValue().
 */

/* @return : the nbBits field of `elt` (its low 8 bits). */
static size_t HUF_getNbBits(HUF_CElt elt)
{
    return elt & (size_t)0xFF;
}

/* @return : `elt` unmasked. The nbBits field sits in the low 8 bits, the rest
 * is noise coming from the value. Only valid where the consumer ignores the
 * upper bits, e.g. when the result is accumulated into a bit position that is
 * itself only ever read through a `& 0xFF` mask. */
static size_t HUF_getNbBitsFast(HUF_CElt elt)
{
    return elt;
}

/* @return : `elt` with the nbBits field cleared, i.e. only the value bits.
 * The mask is built in size_t so that it does not depend on the implicit
 * sign conversion of a negative int. */
static size_t HUF_getValue(HUF_CElt elt)
{
    return elt & ~(size_t)0xFF;
}

/* @return : `elt` unmasked, i.e. the value with the nbBits field still present
 * in the low bits. Only valid where the caller guarantees that those dirty
 * low bits cannot corrupt data it has already produced. */
static size_t HUF_getValueFast(HUF_CElt elt)
{
    return elt;
}

/* Overwrites the whole element with `nbBits` : any value previously stored
 * in `*elt` is discarded, so this must be called before HUF_setValue(). */
static void HUF_setNbBits(HUF_CElt* elt, size_t nbBits)
{
    assert(nbBits <= HUF_TABLELOG_ABSOLUTEMAX);
    *elt = nbBits;
}

/* Stores `value` into the top nbBits bits of `*elt`, where nbBits is read back
 * from the element itself. `value` must fit in nbBits bits.
 * Does nothing when nbBits == 0, which also avoids shifting by the full
 * width of the type. */
static void HUF_setValue(HUF_CElt* elt, size_t value)
{
    size_t const nbBits = HUF_getNbBits(*elt);
    if (nbBits > 0) {
        assert((value >> nbBits) == 0);
        *elt |= value << (sizeof(HUF_CElt) * 8 - nbBits);
    }
}
|
||||
|
||||
typedef struct {
|
||||
HUF_CompressWeightsWksp wksp;
|
||||
@ -117,9 +173,10 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
|
||||
const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog,
|
||||
void* workspace, size_t workspaceSize)
|
||||
{
|
||||
HUF_CElt const* const ct = CTable + 1;
|
||||
BYTE* op = (BYTE*)dst;
|
||||
U32 n;
|
||||
HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)workspace;
|
||||
HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, sizeof(U32));
|
||||
|
||||
/* check conditions */
|
||||
if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC);
|
||||
@ -130,7 +187,7 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
|
||||
for (n=1; n<huffLog+1; n++)
|
||||
wksp->bitsToWeight[n] = (BYTE)(huffLog + 1 - n);
|
||||
for (n=0; n<maxSymbolValue; n++)
|
||||
wksp->huffWeight[n] = wksp->bitsToWeight[CTable[n].nbBits];
|
||||
wksp->huffWeight[n] = wksp->bitsToWeight[HUF_getNbBits(ct[n])];
|
||||
|
||||
/* attempt weights compression by FSE */
|
||||
if (maxDstSize < 1) return ERROR(dstSize_tooSmall);
|
||||
@ -167,6 +224,7 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
|
||||
U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for values from 0 to 16 */
|
||||
U32 tableLog = 0;
|
||||
U32 nbSymbols = 0;
|
||||
HUF_CElt* const ct = CTable + 1;
|
||||
|
||||
/* get symbol weights */
|
||||
CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize));
|
||||
@ -176,6 +234,8 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
|
||||
if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
|
||||
if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall);
|
||||
|
||||
CTable[0] = tableLog;
|
||||
|
||||
/* Prepare base value per rank */
|
||||
{ U32 n, nextRankStart = 0;
|
||||
for (n=1; n<=tableLog; n++) {
|
||||
@ -187,13 +247,13 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
|
||||
/* fill nbBits */
|
||||
{ U32 n; for (n=0; n<nbSymbols; n++) {
|
||||
const U32 w = huffWeight[n];
|
||||
CTable[n].nbBits = (BYTE)(tableLog + 1 - w) & -(w != 0);
|
||||
HUF_setNbBits(ct + n, (BYTE)(tableLog + 1 - w) & -(w != 0));
|
||||
} }
|
||||
|
||||
/* fill val */
|
||||
{ U16 nbPerRank[HUF_TABLELOG_MAX+2] = {0}; /* support w=0=>n=tableLog+1 */
|
||||
U16 valPerRank[HUF_TABLELOG_MAX+2] = {0};
|
||||
{ U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[CTable[n].nbBits]++; }
|
||||
{ U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[HUF_getNbBits(ct[n])]++; }
|
||||
/* determine starting value per rank */
|
||||
valPerRank[tableLog+1] = 0; /* for w==0 */
|
||||
{ U16 min = 0;
|
||||
@ -203,18 +263,18 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
|
||||
min >>= 1;
|
||||
} }
|
||||
/* assign value within rank, symbol order */
|
||||
{ U32 n; for (n=0; n<nbSymbols; n++) CTable[n].val = valPerRank[CTable[n].nbBits]++; }
|
||||
{ U32 n; for (n=0; n<nbSymbols; n++) HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); }
|
||||
}
|
||||
|
||||
*maxSymbolValuePtr = nbSymbols - 1;
|
||||
return readSize;
|
||||
}
|
||||
|
||||
U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue)
|
||||
U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue)
|
||||
{
|
||||
const HUF_CElt* table = (const HUF_CElt*)symbolTable;
|
||||
const HUF_CElt* ct = CTable + 1;
|
||||
assert(symbolValue <= HUF_SYMBOLVALUE_MAX);
|
||||
return table[symbolValue].nbBits;
|
||||
return (U32)HUF_getNbBits(ct[symbolValue]);
|
||||
}
|
||||
|
||||
|
||||
@ -491,6 +551,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
|
||||
*/
|
||||
static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, int nonNullRank, U32 maxSymbolValue, U32 maxNbBits)
|
||||
{
|
||||
HUF_CElt* const ct = CTable + 1;
|
||||
/* fill result into ctable (val, nbBits) */
|
||||
int n;
|
||||
U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0};
|
||||
@ -506,20 +567,20 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i
|
||||
min >>= 1;
|
||||
} }
|
||||
for (n=0; n<alphabetSize; n++)
|
||||
CTable[huffNode[n].byte].nbBits = huffNode[n].nbBits; /* push nbBits per symbol, symbol order */
|
||||
HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */
|
||||
for (n=0; n<alphabetSize; n++)
|
||||
CTable[n].val = valPerRank[CTable[n].nbBits]++; /* assign value within rank, symbol order */
|
||||
HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); /* assign value within rank, symbol order */
|
||||
CTable[0] = maxNbBits;
|
||||
}
|
||||
|
||||
size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
|
||||
size_t HUF_buildCTable_wksp (HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
|
||||
{
|
||||
HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)workSpace;
|
||||
HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, sizeof(U32));
|
||||
nodeElt* const huffNode0 = wksp_tables->huffNodeTbl;
|
||||
nodeElt* const huffNode = huffNode0+1;
|
||||
int nonNullRank;
|
||||
|
||||
/* safety checks */
|
||||
if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */
|
||||
if (wkspSize < sizeof(HUF_buildCTable_wksp_tables))
|
||||
return ERROR(workSpace_tooSmall);
|
||||
if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;
|
||||
@ -537,91 +598,327 @@ size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbo
|
||||
maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits);
|
||||
if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */
|
||||
|
||||
HUF_buildCTableFromTree(tree, huffNode, nonNullRank, maxSymbolValue, maxNbBits);
|
||||
HUF_buildCTableFromTree(CTable, huffNode, nonNullRank, maxSymbolValue, maxNbBits);
|
||||
|
||||
return maxNbBits;
|
||||
}
|
||||
|
||||
size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue)
|
||||
{
|
||||
HUF_CElt const* ct = CTable + 1;
|
||||
size_t nbBits = 0;
|
||||
int s;
|
||||
for (s = 0; s <= (int)maxSymbolValue; ++s) {
|
||||
nbBits += CTable[s].nbBits * count[s];
|
||||
nbBits += HUF_getNbBits(ct[s]) * count[s];
|
||||
}
|
||||
return nbBits >> 3;
|
||||
}
|
||||
|
||||
int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
|
||||
HUF_CElt const* ct = CTable + 1;
|
||||
int bad = 0;
|
||||
int s;
|
||||
for (s = 0; s <= (int)maxSymbolValue; ++s) {
|
||||
bad |= (count[s] != 0) & (CTable[s].nbBits == 0);
|
||||
bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0);
|
||||
}
|
||||
return !bad;
|
||||
}
|
||||
|
||||
size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }
|
||||
|
||||
FORCE_INLINE_TEMPLATE void
|
||||
HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable)
|
||||
/** HUF_CStream_t:
|
||||
* Huffman uses its own BIT_CStream_t implementation.
|
||||
* There are three major differences from BIT_CStream_t:
|
||||
* 1. HUF_addBits() takes a HUF_CElt (size_t) which is
|
||||
* the pair (nbBits, value), packed using the following
|
||||
* format:
|
||||
* - Bits [0, 4) = nbBits
|
||||
* - Bits [4, 64 - nbBits) = 0
|
||||
* - Bits [64 - nbBits, 64) = value
|
||||
* 2. The bitContainer is built from the upper bits and
|
||||
* right shifted. E.g. to add a new value of N bits
|
||||
* you right shift the bitContainer by N, then or in
|
||||
* the new value into the N upper bits.
|
||||
* 3. The bitstream has two bit containers. You can add
|
||||
* bits to the second container and merge them into
|
||||
* the first container.
|
||||
*/
|
||||
|
||||
#define HUF_BITS_IN_CONTAINER (sizeof(size_t) * 8)
|
||||
|
||||
typedef struct {
|
||||
size_t bitContainer[2];
|
||||
size_t bitPos[2];
|
||||
|
||||
BYTE* startPtr;
|
||||
BYTE* ptr;
|
||||
BYTE* endPtr;
|
||||
} HUF_CStream_t;
|
||||
|
||||
/**! HUF_initCStream():
|
||||
* Initializes the bitstream.
|
||||
* @returns 0 or an error code.
|
||||
*/
|
||||
static size_t HUF_initCStream(HUF_CStream_t* bitC,
|
||||
void* startPtr, size_t dstCapacity)
|
||||
{
|
||||
BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits);
|
||||
ZSTD_memset(bitC, 0, sizeof(*bitC));
|
||||
bitC->startPtr = (BYTE*)startPtr;
|
||||
bitC->ptr = bitC->startPtr;
|
||||
bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer[0]);
|
||||
if (dstCapacity <= sizeof(bitC->bitContainer[0])) return ERROR(dstSize_tooSmall);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define HUF_FLUSHBITS(s) BIT_flushBits(s)
|
||||
/*! HUF_addBits():
|
||||
* Adds the symbol stored in HUF_CElt elt to the bitstream.
|
||||
*
|
||||
* @param elt The element we're adding. This is a (nbBits, value) pair.
|
||||
* See the HUF_CStream_t docs for the format.
|
||||
* @param idx Insert into the bit container at this index (0 or 1).
|
||||
* @param kFast This is a template parameter. If the bitstream is guaranteed
|
||||
* to have at least 4 unused bits after this call it may be 1,
|
||||
* otherwise it must be 0. HUF_addBits() is faster when fast is set.
|
||||
*/
|
||||
FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int idx, int kFast)
|
||||
{
|
||||
assert(idx <= 1);
|
||||
assert(HUF_getNbBits(elt) <= HUF_TABLELOG_ABSOLUTEMAX);
|
||||
/* This is efficient on x86-64 with BMI2 because shrx
|
||||
* only reads the low 6 bits of the register. The compiler
|
||||
* knows this and elides the mask. When fast is set,
|
||||
* every operation can use the same value loaded from elt.
|
||||
*/
|
||||
bitC->bitContainer[idx] >>= HUF_getNbBits(elt);
|
||||
bitC->bitContainer[idx] |= kFast ? HUF_getValueFast(elt) : HUF_getValue(elt);
|
||||
/* We only read the low 8 bits of bitC->bitPos[idx] so it
|
||||
* doesn't matter that the high bits have noise from the value.
|
||||
*/
|
||||
bitC->bitPos[idx] += HUF_getNbBitsFast(elt);
|
||||
assert((bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER);
|
||||
/* The last 4-bits of elt are dirty if fast is set,
|
||||
* so we must not be overwriting bits that have already been
|
||||
* inserted into the bit container.
|
||||
*/
|
||||
#if DEBUGLEVEL >= 1
|
||||
{
|
||||
size_t const nbBits = HUF_getNbBits(elt);
|
||||
size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1;
|
||||
/* Middle bits are 0. */
|
||||
assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0);
|
||||
/* We didn't overwrite any bits in the bit container. */
|
||||
assert(!kFast || (bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#define HUF_FLUSHBITS_1(stream) \
|
||||
if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream)
|
||||
FORCE_INLINE_TEMPLATE void HUF_zeroIndex1(HUF_CStream_t* bitC)
|
||||
{
|
||||
bitC->bitContainer[1] = 0;
|
||||
bitC->bitPos[1] = 0;
|
||||
}
|
||||
|
||||
/*! HUF_mergeIndex1() :
|
||||
* Merges the bit container @ index 1 into the bit container @ index 0
|
||||
* and zeros the bit container @ index 1.
|
||||
*/
|
||||
FORCE_INLINE_TEMPLATE void HUF_mergeIndex1(HUF_CStream_t* bitC)
|
||||
{
|
||||
assert((bitC->bitPos[1] & 0xFF) < HUF_BITS_IN_CONTAINER);
|
||||
bitC->bitContainer[0] >>= (bitC->bitPos[1] & 0xFF);
|
||||
bitC->bitContainer[0] |= bitC->bitContainer[1];
|
||||
bitC->bitPos[0] += bitC->bitPos[1];
|
||||
assert((bitC->bitPos[0] & 0xFF) <= HUF_BITS_IN_CONTAINER);
|
||||
}
|
||||
|
||||
/*! HUF_flushBits() :
|
||||
* Flushes the bits in the bit container @ index 0.
|
||||
*
|
||||
* @post bitPos will be < 8.
|
||||
* @param kFast If kFast is set then we must know a-priori that
|
||||
* the bit container will not overflow.
|
||||
*/
|
||||
FORCE_INLINE_TEMPLATE void HUF_flushBits(HUF_CStream_t* bitC, int kFast)
|
||||
{
|
||||
/* The upper bits of bitPos are noisy, so we must mask by 0xFF. */
|
||||
size_t const nbBits = bitC->bitPos[0] & 0xFF;
|
||||
size_t const nbBytes = nbBits >> 3;
|
||||
/* The top nbBits bits of bitContainer are the ones we need. */
|
||||
size_t const bitContainer = bitC->bitContainer[0] >> (HUF_BITS_IN_CONTAINER - nbBits);
|
||||
/* Mask bitPos to account for the bytes we consumed. */
|
||||
bitC->bitPos[0] &= 7;
|
||||
assert(nbBits > 0);
|
||||
assert(nbBits <= sizeof(bitC->bitContainer[0]) * 8);
|
||||
assert(bitC->ptr <= bitC->endPtr);
|
||||
MEM_writeLEST(bitC->ptr, bitContainer);
|
||||
bitC->ptr += nbBytes;
|
||||
assert(!kFast || bitC->ptr <= bitC->endPtr);
|
||||
if (!kFast && bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr;
|
||||
/* bitContainer doesn't need to be modified because the leftover
|
||||
* bits are already the top bitPos bits. And we don't care about
|
||||
* noise in the lower values.
|
||||
*/
|
||||
}
|
||||
|
||||
/*! HUF_endMark()
|
||||
* @returns The Huffman stream end mark: A 1-bit value = 1.
|
||||
*/
|
||||
static HUF_CElt HUF_endMark(void)
|
||||
{
|
||||
HUF_CElt endMark;
|
||||
HUF_setNbBits(&endMark, 1);
|
||||
HUF_setValue(&endMark, 1);
|
||||
return endMark;
|
||||
}
|
||||
|
||||
/*! HUF_closeCStream() :
|
||||
* @return Size of CStream, in bytes,
|
||||
* or 0 if it could not fit into dstBuffer */
|
||||
static size_t HUF_closeCStream(HUF_CStream_t* bitC)
{
    /* Terminate the stream with the end mark, then flush using the
     * non-fast (bounds-clamping) variants. */
    HUF_addBits(bitC, HUF_endMark(), /* idx */ 0, /* kFast */ 0);
    HUF_flushBits(bitC, /* kFast */ 0);
    {
        /* Bits left over after the flush : they occupy one extra, partial byte. */
        size_t const nbBits = bitC->bitPos[0] & 0xFF;
        if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
        /* Explicit cast : the pointer difference is a signed ptrdiff_t,
         * while the function returns a size_t. */
        return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0);
    }
}
|
||||
|
||||
FORCE_INLINE_TEMPLATE void
|
||||
HUF_encodeSymbol(HUF_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable, int idx, int fast)
|
||||
{
|
||||
HUF_addBits(bitCPtr, CTable[symbol], idx, fast);
|
||||
}
|
||||
|
||||
FORCE_INLINE_TEMPLATE void
|
||||
HUF_compress1X_usingCTable_internal_body_loop(HUF_CStream_t* bitC,
|
||||
const BYTE* ip, size_t srcSize,
|
||||
const HUF_CElt* ct,
|
||||
int kUnroll, int kFastFlush, int kLastFast)
|
||||
{
|
||||
/* Join to kUnroll */
|
||||
int n = (int)srcSize;
|
||||
int rem = n % kUnroll;
|
||||
if (rem > 0) {
|
||||
for (; rem > 0; --rem) {
|
||||
HUF_encodeSymbol(bitC, ip[--n], ct, 0, /* fast */ 0);
|
||||
}
|
||||
HUF_flushBits(bitC, kFastFlush);
|
||||
}
|
||||
assert(n % kUnroll == 0);
|
||||
|
||||
/* Join to 2 * kUnroll */
|
||||
if (n % (2 * kUnroll)) {
|
||||
int u;
|
||||
for (u = 1; u < kUnroll; ++u) {
|
||||
HUF_encodeSymbol(bitC, ip[n - u], ct, 0, 1);
|
||||
}
|
||||
HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, 0, kLastFast);
|
||||
HUF_flushBits(bitC, kFastFlush);
|
||||
n -= kUnroll;
|
||||
}
|
||||
assert(n % (2 * kUnroll) == 0);
|
||||
|
||||
for (; n>0; n-= 2 * kUnroll) {
|
||||
/* Encode kUnroll symbols into the bitstream @ index 0. */
|
||||
int u;
|
||||
for (u = 1; u < kUnroll; ++u) {
|
||||
HUF_encodeSymbol(bitC, ip[n - u], ct, /* idx */ 0, /* fast */ 1);
|
||||
}
|
||||
HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, /* idx */ 0, /* fast */ kLastFast);
|
||||
HUF_flushBits(bitC, kFastFlush);
|
||||
/* Encode kUnroll symbols into the bitstream @ index 1.
|
||||
* This allows us to start filling the bit container
|
||||
* without any data dependencies.
|
||||
*/
|
||||
HUF_zeroIndex1(bitC);
|
||||
for (u = 1; u < kUnroll; ++u) {
|
||||
HUF_encodeSymbol(bitC, ip[n - kUnroll - u], ct, /* idx */ 1, /* fast */ 1);
|
||||
}
|
||||
HUF_encodeSymbol(bitC, ip[n - kUnroll - kUnroll], ct, /* idx */ 1, /* fast */ kLastFast);
|
||||
/* Merge bitstream @ index 1 into the bitstream @ index 0 */
|
||||
HUF_mergeIndex1(bitC);
|
||||
HUF_flushBits(bitC, kFastFlush);
|
||||
}
|
||||
assert(n == 0);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a tight upper bound on the output space needed by Huffman
|
||||
* with 8 bytes buffer to handle over-writes. If the output is at least
|
||||
* this large we don't need to do bounds checks during Huffman encoding.
|
||||
*/
|
||||
static size_t HUF_tightCompressBound(size_t srcSize, size_t tableLog)
{
    /* Upper bound on the payload : `tableLog` bits for each source symbol. */
    size_t const maxPayloadBits = srcSize * tableLog;
    size_t const maxPayloadBytes = maxPayloadBits >> 3;
    /* Extra room so that over-writes stay inside the buffer. */
    size_t const overwriteSlack = 8;
    return maxPayloadBytes + overwriteSlack;
}
|
||||
|
||||
#define HUF_FLUSHBITS_2(stream) \
|
||||
if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream)
|
||||
|
||||
FORCE_INLINE_TEMPLATE size_t
|
||||
HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize,
|
||||
const void* src, size_t srcSize,
|
||||
const HUF_CElt* CTable)
|
||||
{
|
||||
U32 const tableLog = (U32)CTable[0];
|
||||
HUF_CElt const* ct = CTable + 1;
|
||||
const BYTE* ip = (const BYTE*) src;
|
||||
BYTE* const ostart = (BYTE*)dst;
|
||||
BYTE* const oend = ostart + dstSize;
|
||||
BYTE* op = ostart;
|
||||
size_t n;
|
||||
BIT_CStream_t bitC;
|
||||
HUF_CStream_t bitC;
|
||||
|
||||
/* init */
|
||||
if (dstSize < 8) return 0; /* not enough space to compress */
|
||||
{ size_t const initErr = BIT_initCStream(&bitC, op, (size_t)(oend-op));
|
||||
{ size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op));
|
||||
if (HUF_isError(initErr)) return 0; }
|
||||
|
||||
n = srcSize & ~3; /* join to mod 4 */
|
||||
switch (srcSize & 3)
|
||||
{
|
||||
case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable);
|
||||
HUF_FLUSHBITS_2(&bitC);
|
||||
/* fall-through */
|
||||
case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable);
|
||||
HUF_FLUSHBITS_1(&bitC);
|
||||
/* fall-through */
|
||||
case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable);
|
||||
HUF_FLUSHBITS(&bitC);
|
||||
/* fall-through */
|
||||
case 0 : /* fall-through */
|
||||
default: break;
|
||||
if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11)
|
||||
HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ MEM_32bits() ? 2 : 4, /* kFast */ 0, /* kLastFast */ 0);
|
||||
else {
|
||||
if (MEM_32bits()) {
|
||||
switch (tableLog) {
|
||||
case 11:
|
||||
HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 0);
|
||||
break;
|
||||
case 10:
|
||||
case 9:
|
||||
case 8:
|
||||
HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 1);
|
||||
break;
|
||||
case 7:
|
||||
default:
|
||||
HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 3, /* kFastFlush */ 1, /* kLastFast */ 1);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
switch (tableLog) {
|
||||
case 11:
|
||||
HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 0);
|
||||
break;
|
||||
case 10:
|
||||
HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 1);
|
||||
break;
|
||||
case 9:
|
||||
HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 6, /* kFastFlush */ 1, /* kLastFast */ 0);
|
||||
break;
|
||||
case 8:
|
||||
HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 7, /* kFastFlush */ 1, /* kLastFast */ 0);
|
||||
break;
|
||||
case 7:
|
||||
HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 8, /* kFastFlush */ 1, /* kLastFast */ 0);
|
||||
break;
|
||||
case 6:
|
||||
default:
|
||||
HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 9, /* kFastFlush */ 1, /* kLastFast */ 1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
assert(bitC.ptr <= bitC.endPtr);
|
||||
|
||||
for (; n>0; n-=4) { /* note : n&3==0 at this stage */
|
||||
HUF_encodeSymbol(&bitC, ip[n- 1], CTable);
|
||||
HUF_FLUSHBITS_1(&bitC);
|
||||
HUF_encodeSymbol(&bitC, ip[n- 2], CTable);
|
||||
HUF_FLUSHBITS_2(&bitC);
|
||||
HUF_encodeSymbol(&bitC, ip[n- 3], CTable);
|
||||
HUF_FLUSHBITS_1(&bitC);
|
||||
HUF_encodeSymbol(&bitC, ip[n- 4], CTable);
|
||||
HUF_FLUSHBITS(&bitC);
|
||||
}
|
||||
|
||||
return BIT_closeCStream(&bitC);
|
||||
return HUF_closeCStream(&bitC);
|
||||
}
|
||||
|
||||
#if DYNAMIC_BMI2
|
||||
@ -671,7 +968,6 @@ size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, si
|
||||
return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
|
||||
}
|
||||
|
||||
|
||||
static size_t
|
||||
HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
|
||||
const void* src, size_t srcSize,
|
||||
@ -751,10 +1047,11 @@ static size_t HUF_compressCTable_internal(
|
||||
|
||||
typedef struct {
|
||||
unsigned count[HUF_SYMBOLVALUE_MAX + 1];
|
||||
HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1];
|
||||
HUF_CElt CTable[HUF_CTABLE_SIZE_ST(HUF_SYMBOLVALUE_MAX)];
|
||||
union {
|
||||
HUF_buildCTable_wksp_tables buildCTable_wksp;
|
||||
HUF_WriteCTableWksp writeCTable_wksp;
|
||||
U32 hist_wksp[HIST_WKSP_SIZE_U32];
|
||||
} wksps;
|
||||
} HUF_compress_tables_t;
|
||||
|
||||
@ -763,26 +1060,25 @@ typedef struct {
|
||||
|
||||
/* HUF_compress_internal() :
|
||||
* `workSpace` does not need to be aligned (it is aligned up internally to sizeof(size_t)),
|
||||
* and occupies the same space as a table of HUF_WORKSPACE_SIZE_U32 unsigned */
|
||||
* and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 U64 */
|
||||
static size_t
|
||||
HUF_compress_internal (void* dst, size_t dstSize,
|
||||
const void* src, size_t srcSize,
|
||||
unsigned maxSymbolValue, unsigned huffLog,
|
||||
HUF_nbStreams_e nbStreams,
|
||||
void* workSpace_align4, size_t wkspSize,
|
||||
void* workSpace, size_t wkspSize,
|
||||
HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat,
|
||||
const int bmi2, unsigned suspectUncompressible)
|
||||
{
|
||||
HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace_align4;
|
||||
HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, sizeof(size_t));
|
||||
BYTE* const ostart = (BYTE*)dst;
|
||||
BYTE* const oend = ostart + dstSize;
|
||||
BYTE* op = ostart;
|
||||
|
||||
HUF_STATIC_ASSERT(sizeof(*table) <= HUF_WORKSPACE_SIZE);
|
||||
assert(((size_t)workSpace_align4 & 3) == 0); /* must be aligned on 4-bytes boundaries */
|
||||
HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE);
|
||||
|
||||
/* checks & inits */
|
||||
if (wkspSize < HUF_WORKSPACE_SIZE) return ERROR(workSpace_tooSmall);
|
||||
if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall);
|
||||
if (!srcSize) return 0; /* Uncompressed */
|
||||
if (!dstSize) return 0; /* cannot fit anything within dst budget */
|
||||
if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); /* current block size limit */
|
||||
@ -814,7 +1110,7 @@ HUF_compress_internal (void* dst, size_t dstSize,
|
||||
}
|
||||
|
||||
/* Scan input and build symbol stats */
|
||||
{ CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, workSpace_align4, wkspSize) );
|
||||
{ CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->wksps.hist_wksp, sizeof(table->wksps.hist_wksp)) );
|
||||
if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */
|
||||
if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */
|
||||
}
|
||||
@ -839,9 +1135,12 @@ HUF_compress_internal (void* dst, size_t dstSize,
|
||||
&table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp));
|
||||
CHECK_F(maxBits);
|
||||
huffLog = (U32)maxBits;
|
||||
/* Zero unused symbols in CTable, so we can check it for validity */
|
||||
ZSTD_memset(table->CTable + (maxSymbolValue + 1), 0,
|
||||
sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_CElt)));
|
||||
}
|
||||
/* Zero unused symbols in CTable, so we can check it for validity */
|
||||
{
|
||||
size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue);
|
||||
size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt);
|
||||
ZSTD_memset(table->CTable + ctableSize, 0, unusedSize);
|
||||
}
|
||||
|
||||
/* Write table description header */
|
||||
@ -939,7 +1238,7 @@ size_t HUF_compress1X (void* dst, size_t dstSize,
|
||||
const void* src, size_t srcSize,
|
||||
unsigned maxSymbolValue, unsigned huffLog)
|
||||
{
|
||||
unsigned workSpace[HUF_WORKSPACE_SIZE_U32];
|
||||
U64 workSpace[HUF_WORKSPACE_SIZE_U64];
|
||||
return HUF_compress1X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
|
||||
}
|
||||
|
||||
@ -947,7 +1246,7 @@ size_t HUF_compress2 (void* dst, size_t dstSize,
|
||||
const void* src, size_t srcSize,
|
||||
unsigned maxSymbolValue, unsigned huffLog)
|
||||
{
|
||||
unsigned workSpace[HUF_WORKSPACE_SIZE_U32];
|
||||
U64 workSpace[HUF_WORKSPACE_SIZE_U64];
|
||||
return HUF_compress4X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
|
||||
}
|
||||
|
||||
|
@ -63,7 +63,7 @@ typedef struct {
|
||||
} ZSTD_localDict;
|
||||
|
||||
typedef struct {
|
||||
HUF_CElt CTable[HUF_CTABLE_SIZE_U32(255)];
|
||||
HUF_CElt CTable[HUF_CTABLE_SIZE_ST(255)];
|
||||
HUF_repeat repeatMode;
|
||||
} ZSTD_hufCTables_t;
|
||||
|
||||
|
@ -126,7 +126,7 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
|
||||
optPtr->litSum = 0;
|
||||
for (lit=0; lit<=MaxLit; lit++) {
|
||||
U32 const scaleLog = 11; /* scale to 2K */
|
||||
U32 const bitCost = HUF_getNbBits(optPtr->symbolCosts->huf.CTable, lit);
|
||||
U32 const bitCost = HUF_getNbBitsFromCTable(optPtr->symbolCosts->huf.CTable, lit);
|
||||
assert(bitCost <= scaleLog);
|
||||
optPtr->litFreq[lit] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
|
||||
optPtr->litSum += optPtr->litFreq[lit];
|
||||
|
@ -185,7 +185,7 @@ BYTE SEQUENCE_LLCODE[ZSTD_BLOCKSIZE_MAX];
|
||||
BYTE SEQUENCE_MLCODE[ZSTD_BLOCKSIZE_MAX];
|
||||
BYTE SEQUENCE_OFCODE[ZSTD_BLOCKSIZE_MAX];
|
||||
|
||||
unsigned WKSP[HUF_WORKSPACE_SIZE_U32];
|
||||
U64 WKSP[HUF_WORKSPACE_SIZE_U64];
|
||||
|
||||
typedef struct {
|
||||
size_t contentSize; /* 0 means unknown (unless contentSize == windowSize == 0) */
|
||||
@ -199,7 +199,7 @@ typedef struct {
|
||||
int hufInit;
|
||||
/* the distribution used in the previous block for repeat mode */
|
||||
BYTE hufDist[DISTSIZE];
|
||||
HUF_CElt hufTable [256];
|
||||
HUF_CElt hufTable [HUF_CTABLE_SIZE_ST(255)];
|
||||
|
||||
int fseInit;
|
||||
FSE_CTable offcodeCTable [FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
|
||||
|
Loading…
x
Reference in New Issue
Block a user