Merge pull request #1411 from facebook/prefetch_dict

Improves decompression speed when using a cold dictionary
2018-11-09 11:31:35 -08:00 · 2018-11-09 11:31:35 -08:00 · 7b0c551bff
commit 7b0c551bff
parent 1b4a9c518b 483759a3de
5 changed files with 21 additions and 25 deletions
--- a/contrib/largeNbDicts/Makefile
+++ b/contrib/largeNbDicts/Makefile
@ -33,7 +33,7 @@ largeNbDicts: util.o bench.o datagen.o xxhash.o largeNbDicts.c $(LIBZSTD)

 .PHONY: $(LIBZSTD)
 $(LIBZSTD):
-	$(MAKE) -C $(LIBDIR) libzstd.a
+	$(MAKE) -C $(LIBDIR) libzstd.a CFLAGS="$(CFLAGS)"

 bench.o  : $(PROGDIR)/bench.c
 	$(CC) $(CPPFLAGS) $(CFLAGS) $^ -c
@ -50,4 +50,5 @@ xxhash.o : $(LIBDIR)/common/xxhash.c

 clean:
 	$(RM) *.o
+	$(MAKE) -C $(LIBDIR) clean > /dev/null
 	$(RM) largeNbDicts
--- a/contrib/largeNbDicts/largeNbDicts.c
+++ b/contrib/largeNbDicts/largeNbDicts.c
@ -49,6 +49,7 @@


 /*---  Macros  ---*/
+
 #define CONTROL(c)   { if (!(c)) abort(); }
 #undef MIN
 #define MIN(a,b)     ((a) < (b) ? (a) : (b))
@ -594,6 +595,7 @@ int bench(const char** fileNameTable, unsigned nbFiles,
    if (blockSize)
        DISPLAYLEVEL(3, "of max size %u bytes ", (unsigned)blockSize);
    DISPLAYLEVEL(3, "\n");
+    size_t const totalSrcSlicesSize = sliceCollection_totalCapacity(srcSlices);


    size_t* const dstCapacities = malloc(nbBlocks * sizeof(*dstCapacities));
@ -625,8 +627,8 @@ int bench(const char** fileNameTable, unsigned nbFiles,

    /* dictionary determination */
    buffer_t const dictBuffer = createDictionaryBuffer(dictionary,
-                                srcBuffer.ptr,
-                                srcSlices.capacities, nbBlocks,
+                                srcs.buffer.ptr,
+                                srcs.slices.capacities, srcs.slices.nbSlices,
                                DICTSIZE);
    CONTROL(dictBuffer.ptr != NULL);

@ -637,7 +639,7 @@ int bench(const char** fileNameTable, unsigned nbFiles,
    CONTROL(cTotalSizeNoDict != 0);
    DISPLAYLEVEL(3, "compressing at level %u without dictionary : Ratio=%.2f  (%u bytes) \n",
                    clevel,
-                    (double)srcSize / cTotalSizeNoDict, (unsigned)cTotalSizeNoDict);
+                    (double)totalSrcSlicesSize / cTotalSizeNoDict, (unsigned)cTotalSizeNoDict);

    size_t* const cSizes = malloc(nbBlocks * sizeof(size_t));
    CONTROL(cSizes != NULL);
@ -646,7 +648,7 @@ int bench(const char** fileNameTable, unsigned nbFiles,
    CONTROL(cTotalSize != 0);
    DISPLAYLEVEL(3, "compressed using a %u bytes dictionary : Ratio=%.2f  (%u bytes) \n",
                    (unsigned)dictBuffer.size,
-                    (double)srcSize / cTotalSize, (unsigned)cTotalSize);
+                    (double)totalSrcSlicesSize / cTotalSize, (unsigned)cTotalSize);

    /* now dstSlices contain the real compressed size of each block, instead of the maximum capacity */
    shrinkSizes(dstSlices, cSizes);
--- a/lib/decompress/zstd_ddict.c
+++ b/lib/decompress/zstd_ddict.c
@ -15,7 +15,6 @@
 *  Dependencies
 *********************************************************/
 #include <string.h>      /* memcpy, memmove, memset */
-#include "compiler.h"    /* prefetch */
 #include "cpu.h"         /* bmi2 */
 #include "mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
--- a/lib/decompress/zstd_decompress.c
+++ b/lib/decompress/zstd_decompress.c
@ -56,7 +56,6 @@
 *  Dependencies
 *********************************************************/
 #include <string.h>      /* memcpy, memmove, memset */
-#include "compiler.h"    /* prefetch */
 #include "cpu.h"         /* bmi2 */
 #include "mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
--- a/lib/decompress/zstd_decompress_block.c
+++ b/lib/decompress/zstd_decompress_block.c
@ -507,16 +507,6 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
        }
    }

-    /* prefetch dictionary content */
-    if (dctx->ddictIsCold) {
-        size_t const dictSize = (const char*)dctx->prefixStart - (const char*)dctx->virtualStart;
-        size_t const psmin = MIN(dictSize, (size_t)(64*nbSeq) /* heuristic */ );
-        size_t const pSize = MIN(psmin, 128 KB /* protection */ );
-        const void* const pStart = (const char*)dctx->dictEnd - pSize;
-        PREFETCH_AREA(pStart, pSize);
-        dctx->ddictIsCold = 0;
-    }
-
    return ip-istart;
 }

@ -1046,6 +1036,7 @@ ZSTD_decompressSequencesLong_body(
        /* prepare in advance */
        for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
            sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
+            PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
        }
        if (seqNb<seqAdvance) return ERROR(corruption_detected);

@ -1070,9 +1061,6 @@ ZSTD_decompressSequencesLong_body(

        /* save reps for next block */
        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
-#undef STORED_SEQS
-#undef STORED_SEQS_MASK
-#undef ADVANCED_SEQS
    }

    /* last literal segment */
@ -1213,20 +1201,27 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
    }

    /* Build Decoding Tables */
-    {   int nbSeq;
+    {   int usePrefetchDecoder = dctx->ddictIsCold;
+        int nbSeq;
        size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
        if (ZSTD_isError(seqHSize)) return seqHSize;
        ip += seqHSize;
        srcSize -= seqHSize;

-        if ( (!frame || (dctx->fParams.windowSize > (1<<24)))
-          && (nbSeq>0) ) {  /* could probably use a larger nbSeq limit */
+        if ( !usePrefetchDecoder
+          && (!frame || (dctx->fParams.windowSize > (1<<24)))
+          && (nbSeq>ADVANCED_SEQS) ) {  /* could probably use a larger nbSeq limit */
            U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
            U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
-            if (shareLongOffsets >= minShare)
-                return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+            usePrefetchDecoder = (shareLongOffsets >= minShare);
        }

+        dctx->ddictIsCold = 0;
+
+        if (usePrefetchDecoder)
+            return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+
+        /* else */
        return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
    }
 }