Allow compressor to repeat Huffman tables

* Compressor saves most recently used Huffman table and reuses it
  if it produces better results.
* I attempted to preserve CPU usage profile.
  I intentionally left all of the existing heuristics in place.
  There is only a speed difference on the second block and later.
  When compressing large enough blocks (say >= 4 KiB) there is
  no significant difference in compression speed.
  Dictionary compression of one block is the same speed for blocks
  with literals <= 1 KiB, and after that the difference is not
  very significant.
* In the synthetic data, with blocks 10 KB or smaller, most blocks
  can't use repeated tables because the previous block did not
  contain a symbol that the current block contains.
  Once blocks are about 12 KB or more, most previous blocks have
  valid Huffman tables for the current block, and the compression
  ratio and decompression speed jumped.
* In silesia blocks as small as 4KB can frequently reuse the
  previous Huffman table (85%), but it isn't as profitable, and
  the previous Huffman table only gets used about 3% of the time.
* Microbenchmarks show that `HUF_validateCTable()` takes ~55 ns
  and `HUF_estimateCompressedSize()` takes ~35 ns.
  They are decently well optimized, the first versions took 90 ns
  and 120 ns respectively. `HUF_validateCTable()` could be twice as
  fast, if we cast the `HUF_CElt*` to a `U32*` and compare to 0.
  However, `U32` has an alignment of 4 instead of 2, so I think that
  might be undefined behavior.
* I've ran `zstreamtest` compiled normally, with UASAN and with MSAN
  for 4 hours each.

The worst case for the speed difference is a bunch of small blocks
in the same frame. I modified `bench.c` to compress the input in a
single frame but with blocks of the given block size, set by `-B`.
Benchmarks on level 1:

|  Program  | Block size |   Corpus  | Ratio | Compression MB/s | Decompression MB/s |
|-----------|------------|-----------|-------|------------------|--------------------|
| zstd.base |        256 | synthetic | 2.364 |            110.0 |              297.0 |
|      zstd |        256 | synthetic | 2.367 |            108.9 |              297.0 |
| zstd.base |        256 | silesia   | 2.204 |             93.8 |              415.7 |
|      zstd |        256 | silesia   | 2.204 |             93.4 |              415.7 |
| zstd.base |        512 | synthetic | 2.594 |            144.2 |              420.0 |
|      zstd |        512 | synthetic | 2.599 |            141.5 |              425.7 |
| zstd.base |        512 | silesia   | 2.358 |            118.4 |              432.6 |
|      zstd |        512 | silesia   | 2.358 |            119.8 |              432.6 |
| zstd.base |       1024 | synthetic | 2.790 |            192.3 |              594.1 |
|      zstd |       1024 | synthetic | 2.794 |            192.3 |              600.0 |
| zstd.base |       1024 | silesia   | 2.524 |            148.2 |              464.2 |
|      zstd |       1024 | silesia   | 2.525 |            148.2 |              467.6 |
| zstd.base |       4096 | synthetic | 3.023 |            300.0 |             1000.0 |
|      zstd |       4096 | synthetic | 3.024 |            300.0 |             1010.1 |
| zstd.base |       4096 | silesia   | 2.779 |            223.1 |              623.5 |
|      zstd |       4096 | silesia   | 2.779 |            223.1 |              636.0 |
| zstd.base |      16384 | synthetic | 3.131 |            350.0 |             1150.1 |
|      zstd |      16384 | synthetic | 3.152 |            350.0 |             1630.3 |
| zstd.base |      16384 | silesia   | 2.871 |            296.5 |              883.3 |
|      zstd |      16384 | silesia   | 2.872 |            294.4 |              898.3 |
This commit is contained in:
Nick Terrell 2017-03-01 17:51:56 -08:00
parent 31432cc57d
commit a419777eb1
3 changed files with 136 additions and 41 deletions

View File

@ -168,6 +168,16 @@ size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSym
size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
typedef enum {
HUF_repeat_none, /**< Cannot use the previous table */
HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */
HUF_repeat_valid /**< Can use the previous table and it is asumed to be valid */
} HUF_repeat;
/** HUF_compress4X_repeat() :
* Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
* If it uses hufTable it does not modify hufTable or repeat.
* If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. */
size_t HUF_compress4X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, HUF_CElt* hufTable, HUF_repeat* repeat); /**< `workSpace` must be a table of at least 1024 unsigned */
/** HUF_buildCTable_wksp() :
* Same as HUF_buildCTable(), but using externally allocated scratch buffer.
@ -216,6 +226,11 @@ size_t HUF_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const void* c
size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /**< `workSpace` must be a table of at least 1024 unsigned */
size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
/** HUF_compress1X_repeat() :
* Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
* If it uses hufTable it does not modify hufTable or repeat.
* If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. */
size_t HUF_compress1X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, HUF_CElt* hufTable, HUF_repeat* repeat); /**< `workSpace` must be a table of at least 1024 unsigned */
size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */
size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */

View File

@ -409,6 +409,25 @@ size_t HUF_buildCTable (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U3
return HUF_buildCTable_wksp(tree, count, maxSymbolValue, maxNbBits, nodeTable, sizeof(nodeTable));
}
static size_t HUF_estimateCompressedSize(HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue)
{
size_t nbBits = 0;
int s;
for (s = 0; s <= (int)maxSymbolValue; ++s) {
nbBits += CTable[s].nbBits * count[s];
}
return nbBits >> 3;
}
static int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
int bad = 0;
int s;
for (s = 0; s <= (int)maxSymbolValue; ++s) {
bad |= (count[s] != 0) & (CTable[s].nbBits == 0);
}
return !bad;
}
static void HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable)
{
BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits);
@ -510,22 +529,37 @@ size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, si
}
static size_t HUF_compressCTable_internal(
BYTE* const ostart, BYTE* op, BYTE* const oend,
const void* src, size_t srcSize,
unsigned singleStream, const HUF_CElt* CTable)
{
size_t const cSize = singleStream ?
HUF_compress1X_usingCTable(op, oend - op, src, srcSize, CTable) :
HUF_compress4X_usingCTable(op, oend - op, src, srcSize, CTable);
if (HUF_isError(cSize)) { return cSize; }
if (cSize==0) { return 0; } /* uncompressible */
op += cSize;
/* check compressibility */
if ((size_t)(op-ostart) >= srcSize-1) { return 0; }
return op-ostart;
}
/* `workSpace` must a table of at least 1024 unsigned */
static size_t HUF_compress_internal (
void* dst, size_t dstSize,
const void* src, size_t srcSize,
unsigned maxSymbolValue, unsigned huffLog,
unsigned singleStream,
void* workSpace, size_t wkspSize)
void* workSpace, size_t wkspSize, HUF_CElt* oldHufTable, HUF_repeat* repeat)
{
BYTE* const ostart = (BYTE*)dst;
BYTE* const oend = ostart + dstSize;
BYTE* op = ostart;
union {
U32 count[HUF_SYMBOLVALUE_MAX+1];
HUF_CElt CTable[HUF_SYMBOLVALUE_MAX+1];
} table; /* `count` can overlap with `CTable`; saves 1 KB */
U32 count[HUF_SYMBOLVALUE_MAX+1];
HUF_CElt CTable[HUF_SYMBOLVALUE_MAX+1];
/* checks & inits */
if (wkspSize < sizeof(huffNodeTable)) return ERROR(GENERIC);
@ -536,38 +570,51 @@ static size_t HUF_compress_internal (
if (!maxSymbolValue) maxSymbolValue = HUF_SYMBOLVALUE_MAX;
if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT;
/* Heuristic : If we don't need to check the validity of the old table use the old table for small inputs */
if (srcSize <= 1024 && repeat && *repeat == HUF_repeat_valid) {
return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable);
}
/* Scan input and build symbol stats */
{ CHECK_V_F(largest, FSE_count_wksp (table.count, &maxSymbolValue, (const BYTE*)src, srcSize, (U32*)workSpace) );
{ CHECK_V_F(largest, FSE_count_wksp (count, &maxSymbolValue, (const BYTE*)src, srcSize, (U32*)workSpace) );
if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */
if (largest <= (srcSize >> 7)+1) return 0; /* Fast heuristic : not compressible enough */
}
/* Check validity of previous table */
if (repeat && *repeat == HUF_repeat_check && !HUF_validateCTable(oldHufTable, count, maxSymbolValue)) {
*repeat = HUF_repeat_none;
}
/* Heuristic : use existing table for small inputs */
if (srcSize <= 1024 && repeat && *repeat != HUF_repeat_none) {
return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable);
}
/* Build Huffman Tree */
huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
{ CHECK_V_F(maxBits, HUF_buildCTable_wksp (table.CTable, table.count, maxSymbolValue, huffLog, workSpace, wkspSize) );
{ CHECK_V_F(maxBits, HUF_buildCTable_wksp (CTable, count, maxSymbolValue, huffLog, workSpace, wkspSize) );
huffLog = (U32)maxBits;
/* Zero the unused symbols so we can check it for validity */
memset(CTable + maxSymbolValue + 1, 0, sizeof(CTable) - (maxSymbolValue + 1) * sizeof(HUF_CElt));
}
/* Write table description header */
{ CHECK_V_F(hSize, HUF_writeCTable (op, dstSize, table.CTable, maxSymbolValue, huffLog) );
if (hSize + 12 >= srcSize) return 0; /* not useful to try compression */
{ CHECK_V_F(hSize, HUF_writeCTable (op, dstSize, CTable, maxSymbolValue, huffLog) );
/* Check if using the previous table will be beneficial */
if (repeat && *repeat != HUF_repeat_none) {
size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, count, maxSymbolValue);
size_t const newSize = HUF_estimateCompressedSize(CTable, count, maxSymbolValue);
if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) {
return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable);
}
}
/* Use the new table */
if (hSize + 12ul >= srcSize) { return 0; }
op += hSize;
if (repeat) { *repeat = HUF_repeat_none; }
if (oldHufTable) { memcpy(oldHufTable, CTable, sizeof(CTable)); } /* Save the new table */
}
/* Compress */
{ size_t const cSize = (singleStream) ?
HUF_compress1X_usingCTable(op, oend - op, src, srcSize, table.CTable) : /* single segment */
HUF_compress4X_usingCTable(op, oend - op, src, srcSize, table.CTable);
if (HUF_isError(cSize)) return cSize;
if (cSize==0) return 0; /* uncompressible */
op += cSize;
}
/* check compressibility */
if ((size_t)(op-ostart) >= srcSize-1)
return 0;
return op-ostart;
return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, CTable);
}
@ -576,7 +623,16 @@ size_t HUF_compress1X_wksp (void* dst, size_t dstSize,
unsigned maxSymbolValue, unsigned huffLog,
void* workSpace, size_t wkspSize)
{
return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 1 /* single stream */, workSpace, wkspSize);
return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 1 /* single stream */, workSpace, wkspSize, NULL, NULL);
}
size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
const void* src, size_t srcSize,
unsigned maxSymbolValue, unsigned huffLog,
void* workSpace, size_t wkspSize,
HUF_CElt* hufTable, HUF_repeat* repeat)
{
return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 1 /* single stream */, workSpace, wkspSize, hufTable, repeat);
}
size_t HUF_compress1X (void* dst, size_t dstSize,
@ -592,7 +648,16 @@ size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
unsigned maxSymbolValue, unsigned huffLog,
void* workSpace, size_t wkspSize)
{
return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 0 /* 4 streams */, workSpace, wkspSize);
return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 0 /* 4 streams */, workSpace, wkspSize, NULL, NULL);
}
size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
const void* src, size_t srcSize,
unsigned maxSymbolValue, unsigned huffLog,
void* workSpace, size_t wkspSize,
HUF_CElt* hufTable, HUF_repeat* repeat)
{
return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 0 /* 4 streams */, workSpace, wkspSize, hufTable, repeat);
}
size_t HUF_compress2 (void* dst, size_t dstSize,

View File

@ -81,6 +81,7 @@ struct ZSTD_CCtx_s {
U32* chainTable;
HUF_CElt* hufTable;
U32 flagStaticTables;
HUF_repeat flagStaticHufTable;
FSE_CTable offcodeCTable [FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
FSE_CTable litlengthCTable [FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
@ -254,8 +255,11 @@ static size_t ZSTD_resetCCtx_advanced (ZSTD_CCtx* zc,
ZSTD_compResetPolicy_e const crp)
{
if (crp == ZSTDcrp_continue)
if (ZSTD_equivalentParams(params, zc->params))
if (ZSTD_equivalentParams(params, zc->params)) {
zc->flagStaticTables = 0;
zc->flagStaticHufTable = HUF_repeat_none;
return ZSTD_continueCCtx(zc, params, frameContentSize);
}
{ size_t const blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, (size_t)1 << params.cParams.windowLog);
U32 const divider = (params.cParams.searchLength==3) ? 3 : 4;
@ -289,6 +293,7 @@ static size_t ZSTD_resetCCtx_advanced (ZSTD_CCtx* zc,
ptr = zc->hashTable3 + h3Size;
zc->hufTable = (HUF_CElt*)ptr;
zc->flagStaticTables = 0;
zc->flagStaticHufTable = HUF_repeat_none;
ptr = ((U32*)ptr) + 256; /* note : HUF_CElt* is incomplete type, size is simulated using U32 */
zc->nextToUpdate = 1;
@ -374,12 +379,15 @@ size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long
/* copy entropy tables */
dstCCtx->flagStaticTables = srcCCtx->flagStaticTables;
dstCCtx->flagStaticHufTable = srcCCtx->flagStaticHufTable;
if (srcCCtx->flagStaticTables) {
memcpy(dstCCtx->hufTable, srcCCtx->hufTable, 256*4);
memcpy(dstCCtx->litlengthCTable, srcCCtx->litlengthCTable, sizeof(dstCCtx->litlengthCTable));
memcpy(dstCCtx->matchlengthCTable, srcCCtx->matchlengthCTable, sizeof(dstCCtx->matchlengthCTable));
memcpy(dstCCtx->offcodeCTable, srcCCtx->offcodeCTable, sizeof(dstCCtx->offcodeCTable));
}
if (srcCCtx->flagStaticHufTable) {
memcpy(dstCCtx->hufTable, srcCCtx->hufTable, 256*4);
}
return 0;
}
@ -493,24 +501,27 @@ static size_t ZSTD_compressLiterals (ZSTD_CCtx* zc,
/* small ? don't even attempt compression (speed opt) */
# define LITERAL_NOENTROPY 63
{ size_t const minLitSize = zc->flagStaticTables ? 6 : LITERAL_NOENTROPY;
{ size_t const minLitSize = zc->flagStaticHufTable == HUF_repeat_valid ? 6 : LITERAL_NOENTROPY;
if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
}
if (dstCapacity < lhSize+1) return ERROR(dstSize_tooSmall); /* not enough space for compression */
if (zc->flagStaticTables && (lhSize==3)) {
hType = set_repeat;
singleStream = 1;
cLitSize = HUF_compress1X_usingCTable(ostart+lhSize, dstCapacity-lhSize, src, srcSize, zc->hufTable);
} else {
cLitSize = singleStream ? HUF_compress1X_wksp(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11, zc->tmpCounters, sizeof(zc->tmpCounters))
: HUF_compress4X_wksp(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11, zc->tmpCounters, sizeof(zc->tmpCounters));
{ HUF_repeat repeat = zc->flagStaticHufTable;
if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1;
cLitSize = singleStream ? HUF_compress1X_repeat(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11, zc->tmpCounters, sizeof(zc->tmpCounters), zc->hufTable, &repeat)
: HUF_compress4X_repeat(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11, zc->tmpCounters, sizeof(zc->tmpCounters), zc->hufTable, &repeat);
if (repeat != HUF_repeat_none) { hType = set_repeat; } /* reused the existing table */
else { zc->flagStaticHufTable = HUF_repeat_check; } /* now have a table to reuse */
}
if ((cLitSize==0) | (cLitSize >= srcSize - minGain))
if ((cLitSize==0) | (cLitSize >= srcSize - minGain)) {
zc->flagStaticHufTable = HUF_repeat_none;
return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
if (cLitSize==1)
}
if (cLitSize==1) {
zc->flagStaticHufTable = HUF_repeat_none;
return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
}
/* Build header */
switch(lhSize)
@ -753,9 +764,12 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* zc,
/* check compressibility */
_check_compressibility:
{ size_t const minGain = ZSTD_minGain(srcSize);
size_t const maxCSize = srcSize - minGain;
if ((size_t)(op-ostart) >= maxCSize) return 0; }
{ size_t const minGain = ZSTD_minGain(srcSize);
size_t const maxCSize = srcSize - minGain;
if ((size_t)(op-ostart) >= maxCSize) {
zc->flagStaticHufTable = HUF_repeat_none;
return 0;
} }
/* confirm repcodes */
{ int i; for (i=0; i<ZSTD_REP_NUM; i++) zc->rep[i] = zc->repToConfirm[i]; }
@ -2606,6 +2620,7 @@ static size_t ZSTD_loadDictEntropyStats(ZSTD_CCtx* cctx, const void* dict, size_
}
cctx->flagStaticTables = 1;
cctx->flagStaticHufTable = HUF_repeat_valid;
return dictPtr - (const BYTE*)dict;
}