Merge pull request #1411 from facebook/prefetch_dict

Improves decompression speed when using cold dictionary
dev
Yann Collet 2018-11-09 11:31:35 -08:00 committed by GitHub
commit 7b0c551bff
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 21 additions and 25 deletions

View File

@ -33,7 +33,7 @@ largeNbDicts: util.o bench.o datagen.o xxhash.o largeNbDicts.c $(LIBZSTD)
.PHONY: $(LIBZSTD)
$(LIBZSTD):
$(MAKE) -C $(LIBDIR) libzstd.a
$(MAKE) -C $(LIBDIR) libzstd.a CFLAGS="$(CFLAGS)"
bench.o : $(PROGDIR)/bench.c
$(CC) $(CPPFLAGS) $(CFLAGS) $^ -c
@ -50,4 +50,5 @@ xxhash.o : $(LIBDIR)/common/xxhash.c
clean:
$(RM) *.o
$(MAKE) -C $(LIBDIR) clean > /dev/null
$(RM) largeNbDicts

View File

@ -49,6 +49,7 @@
/*--- Macros ---*/
#define CONTROL(c) { if (!(c)) abort(); }
#undef MIN
#define MIN(a,b) ((a) < (b) ? (a) : (b))
@ -594,6 +595,7 @@ int bench(const char** fileNameTable, unsigned nbFiles,
if (blockSize)
DISPLAYLEVEL(3, "of max size %u bytes ", (unsigned)blockSize);
DISPLAYLEVEL(3, "\n");
size_t const totalSrcSlicesSize = sliceCollection_totalCapacity(srcSlices);
size_t* const dstCapacities = malloc(nbBlocks * sizeof(*dstCapacities));
@ -625,8 +627,8 @@ int bench(const char** fileNameTable, unsigned nbFiles,
/* dictionary determination */
buffer_t const dictBuffer = createDictionaryBuffer(dictionary,
srcBuffer.ptr,
srcSlices.capacities, nbBlocks,
srcs.buffer.ptr,
srcs.slices.capacities, srcs.slices.nbSlices,
DICTSIZE);
CONTROL(dictBuffer.ptr != NULL);
@ -637,7 +639,7 @@ int bench(const char** fileNameTable, unsigned nbFiles,
CONTROL(cTotalSizeNoDict != 0);
DISPLAYLEVEL(3, "compressing at level %u without dictionary : Ratio=%.2f (%u bytes) \n",
clevel,
(double)srcSize / cTotalSizeNoDict, (unsigned)cTotalSizeNoDict);
(double)totalSrcSlicesSize / cTotalSizeNoDict, (unsigned)cTotalSizeNoDict);
size_t* const cSizes = malloc(nbBlocks * sizeof(size_t));
CONTROL(cSizes != NULL);
@ -646,7 +648,7 @@ int bench(const char** fileNameTable, unsigned nbFiles,
CONTROL(cTotalSize != 0);
DISPLAYLEVEL(3, "compressed using a %u bytes dictionary : Ratio=%.2f (%u bytes) \n",
(unsigned)dictBuffer.size,
(double)srcSize / cTotalSize, (unsigned)cTotalSize);
(double)totalSrcSlicesSize / cTotalSize, (unsigned)cTotalSize);
/* now dstSlices contain the real compressed size of each block, instead of the maximum capacity */
shrinkSizes(dstSlices, cSizes);

View File

@ -15,7 +15,6 @@
* Dependencies
*********************************************************/
#include <string.h> /* memcpy, memmove, memset */
#include "compiler.h" /* prefetch */
#include "cpu.h" /* bmi2 */
#include "mem.h" /* low level memory routines */
#define FSE_STATIC_LINKING_ONLY

View File

@ -56,7 +56,6 @@
* Dependencies
*********************************************************/
#include <string.h> /* memcpy, memmove, memset */
#include "compiler.h" /* prefetch */
#include "cpu.h" /* bmi2 */
#include "mem.h" /* low level memory routines */
#define FSE_STATIC_LINKING_ONLY

View File

@ -507,16 +507,6 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
}
}
/* prefetch dictionary content */
if (dctx->ddictIsCold) {
size_t const dictSize = (const char*)dctx->prefixStart - (const char*)dctx->virtualStart;
size_t const psmin = MIN(dictSize, (size_t)(64*nbSeq) /* heuristic */ );
size_t const pSize = MIN(psmin, 128 KB /* protection */ );
const void* const pStart = (const char*)dctx->dictEnd - pSize;
PREFETCH_AREA(pStart, pSize);
dctx->ddictIsCold = 0;
}
return ip-istart;
}
@ -1046,6 +1036,7 @@ ZSTD_decompressSequencesLong_body(
/* prepare in advance */
for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
}
if (seqNb<seqAdvance) return ERROR(corruption_detected);
@ -1070,9 +1061,6 @@ ZSTD_decompressSequencesLong_body(
/* save reps for next block */
{ U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
#undef STORED_SEQS
#undef STORED_SEQS_MASK
#undef ADVANCED_SEQS
}
/* last literal segment */
@ -1213,20 +1201,27 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
}
/* Build Decoding Tables */
{ int nbSeq;
{ int usePrefetchDecoder = dctx->ddictIsCold;
int nbSeq;
size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
if (ZSTD_isError(seqHSize)) return seqHSize;
ip += seqHSize;
srcSize -= seqHSize;
if ( (!frame || (dctx->fParams.windowSize > (1<<24)))
&& (nbSeq>0) ) { /* could probably use a larger nbSeq limit */
if ( !usePrefetchDecoder
&& (!frame || (dctx->fParams.windowSize > (1<<24)))
&& (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */
U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
if (shareLongOffsets >= minShare)
return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
usePrefetchDecoder = (shareLongOffsets >= minShare);
}
dctx->ddictIsCold = 0;
if (usePrefetchDecoder)
return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
/* else */
return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
}
}