Merged ZSTD_preserveUnsortedMark() into ZSTD_reduceIndex()

as it's faster, due to one memory scan instead of two
(confirmed by microbenchmark).

Note: as ZSTD_reduceIndex() is rarely invoked,
the merge does not translate into a visible speed gain.
Consider it an exercise in auto-vectorization and micro-benchmarking.
dev
Yann Collet 2018-02-07 14:22:35 -08:00
parent 0170cf9a7a
commit de68c2ff10
3 changed files with 32 additions and 55 deletions
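For illustration only (not part of the commit): a minimal, self-contained C sketch of the "one scan vs two scans" comparison on a synthetic table. The names reduce_two_pass() and reduce_one_pass() are hypothetical stand-ins for the former ZSTD_preserveUnsortedMark() + ZSTD_reduceTable() sequence and for the merged loop; clock() is only a rough timer, not a rigorous microbenchmark.

/* Hypothetical sketch, not from the zstd sources. */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define UNSORTED_MARK 1           /* plays the role of ZSTD_DUBT_UNSORTED_MARK */
#define TABLE_SIZE    (1u << 22)  /* arbitrary table size for the comparison */

static void reduce_two_pass(unsigned* table, unsigned size, unsigned reducer)
{
    unsigned i;
    for (i = 0; i < size; i++)    /* scan 1 : bump the mark so it survives */
        if (table[i] == UNSORTED_MARK) table[i] += reducer;
    for (i = 0; i < size; i++)    /* scan 2 : reduce indexes, squash small ones to zero */
        table[i] = (table[i] < reducer) ? 0 : table[i] - reducer;
}

static void reduce_one_pass(unsigned* table, unsigned size, unsigned reducer)
{
    unsigned i;
    for (i = 0; i < size; i++) {  /* a single scan does both steps */
        unsigned const v = table[i] + ((table[i] == UNSORTED_MARK) ? reducer : 0);
        table[i] = (v < reducer) ? 0 : v - reducer;
    }
}

int main(void)
{
    unsigned* const table = (unsigned*)malloc(TABLE_SIZE * sizeof(*table));
    unsigned i; clock_t t;
    if (table == NULL) return 1;
    for (i = 0; i < TABLE_SIZE; i++) table[i] = (unsigned)rand();

    t = clock();
    reduce_two_pass(table, TABLE_SIZE, 1000);
    printf("two scans : %ld ticks\n", (long)(clock() - t));

    t = clock();
    reduce_one_pass(table, TABLE_SIZE, 1000);
    printf("one scan  : %ld ticks\n", (long)(clock() - t));

    free(table);
    return 0;
}

Whether either loop actually auto-vectorizes can be checked with compiler reports such as GCC's -fopt-info-vec or clang's -Rpass=loop-vectorize.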


@@ -1223,32 +1223,44 @@ size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long
#define ZSTD_ROWSIZE 16
/*! ZSTD_reduceTable_internal() :
 *  reduce table indexes by `reducerValue`
 *  presume table size is a multiple of ZSTD_ROWSIZE.
 *  Helps auto-vectorization */
static void ZSTD_reduceTable_internal (U32* const table, int const nbRows, U32 const reducerValue)
/*! ZSTD_reduceTable() :
 *  reduce table indexes by `reducerValue`, or squash to zero.
 *  PreserveMark preserves the "unsorted mark" for the btlazy2 strategy.
 *  It must be set to a clear 0/1 value, to remove the branch during inlining.
 *  Presume table size is a multiple of ZSTD_ROWSIZE
 *  to help auto-vectorization */
FORCE_INLINE_TEMPLATE void
ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerValue, int const preserveMark)
{
    int const nbRows = (int)size / ZSTD_ROWSIZE;
    int cellNb = 0;
    int rowNb;
    assert((size & (ZSTD_ROWSIZE-1)) == 0);  /* multiple of ZSTD_ROWSIZE */
    assert(size < (1U<<31));                 /* can be cast to int */
    for (rowNb=0 ; rowNb < nbRows ; rowNb++) {
        int column;
        for (column=0; column<ZSTD_ROWSIZE; column++) {
            if (preserveMark) {
                U32 const adder = (table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) ? reducerValue : 0;
                table[cellNb] += adder;
            }
            if (table[cellNb] < reducerValue) table[cellNb] = 0;
            else table[cellNb] -= reducerValue;
            cellNb++;
    }   }
}
/*! ZSTD_reduceTable() :
 *  reduce table indexes by `reducerValue` */
static void ZSTD_reduceTable (U32* const table, U32 const size, U32 const reducerValue)
static void ZSTD_reduceTable(U32* const table, U32 const size, U32 const reducerValue)
{
    assert((size & (ZSTD_ROWSIZE-1)) == 0);  /* multiple of ZSTD_ROWSIZE */
    assert(size < (1U<<31));                 /* can be cast to int */
    ZSTD_reduceTable_internal(table, size/ZSTD_ROWSIZE, reducerValue);
    ZSTD_reduceTable_internal(table, size, reducerValue, 0);
}

static void ZSTD_reduceTable_btlazy2(U32* const table, U32 const size, U32 const reducerValue)
{
    ZSTD_reduceTable_internal(table, size, reducerValue, 1);
}
/*! ZSTD_ldm_reduceTable() :
 *  reduce table indexes by `reducerValue` */
static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size,
@@ -1273,8 +1285,9 @@ static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue)
    if (zc->appliedParams.cParams.strategy != ZSTD_fast) {
        U32 const chainSize = (U32)1 << zc->appliedParams.cParams.chainLog;
        if (zc->appliedParams.cParams.strategy == ZSTD_btlazy2)
            ZSTD_preserveUnsortedMark(ms->chainTable, chainSize, reducerValue);
        ZSTD_reduceTable(ms->chainTable, chainSize, reducerValue);
            ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerValue);
        else
            ZSTD_reduceTable(ms->chainTable, chainSize, reducerValue);
    }
    if (ms->hashLog3) {
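An aside on the pattern in this hunk (not part of the commit): because each thin wrapper passes preserveMark as a literal 0 or 1, the force-inlined inner loop is specialized and the dead branch disappears. A minimal stand-alone sketch of that technique follows; FORCE_INLINE here is a hypothetical stand-in for zstd's FORCE_INLINE_TEMPLATE, and the helper names are invented.

#include <stdio.h>

#if defined(__GNUC__)
#  define FORCE_INLINE static inline __attribute__((always_inline))
#else
#  define FORCE_INLINE static inline
#endif

#define MARK 1u   /* plays the role of ZSTD_DUBT_UNSORTED_MARK */

FORCE_INLINE void reduce_internal(unsigned* table, unsigned size,
                                  unsigned reducer, int preserveMark)
{
    unsigned i;
    for (i = 0; i < size; i++) {
        if (preserveMark) {   /* compile-time constant after inlining : branch folds away */
            if (table[i] == MARK) table[i] += reducer;
        }
        table[i] = (table[i] < reducer) ? 0 : table[i] - reducer;
    }
}

static void reduce(unsigned* table, unsigned size, unsigned reducer)
{
    reduce_internal(table, size, reducer, 0);   /* plain specialization */
}

static void reduce_btlazy2(unsigned* table, unsigned size, unsigned reducer)
{
    reduce_internal(table, size, reducer, 1);   /* mark-preserving specialization */
}

int main(void)
{
    unsigned t[4] = { 0, 1, 500, 5000 };
    reduce_btlazy2(t, 4, 1000);
    printf("%u %u %u %u\n", t[0], t[1], t[2], t[3]);   /* 0 1 0 4000 : the mark survives */
    reduce(t, 4, 1000);
    printf("%u %u %u %u\n", t[0], t[1], t[2], t[3]);   /* 0 0 0 3000 : without the flag it is squashed */
    return 0;
}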


@@ -32,6 +32,12 @@ extern "C" {
***************************************/
static const U32 g_searchStrength = 8;
#define HASH_READ_SIZE 8
#define ZSTD_DUBT_UNSORTED_MARK 1   /* For btlazy2 strategy, index 1 now means "unsorted".
                                       It could be confused for a real successor at index "1", if sorted as larger than its predecessor.
                                       It's not a big deal though: the candidate will just be sorted again.
                                       Additionally, candidate position 1 will be lost.
                                       But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss.
                                       The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy. */
/*-*************************************
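A worked example (not part of the commit) to make the interaction with ZSTD_reduceIndex() concrete; the reducerValue below is an arbitrary illustrative number. An entry equal to ZSTD_DUBT_UNSORTED_MARK is pre-incremented by reducerValue, so the subsequent subtraction brings it back to 1; a plain reduction would squash it to 0 and the chain of unsorted candidates would appear terminated.

#include <assert.h>

#define ZSTD_DUBT_UNSORTED_MARK 1

int main(void)
{
    unsigned const reducerValue = 0x01000000;       /* arbitrary illustrative value */
    unsigned const entry = ZSTD_DUBT_UNSORTED_MARK;

    /* plain reduction : the mark is below reducerValue, so it is squashed */
    unsigned const plain = (entry < reducerValue) ? 0 : entry - reducerValue;

    /* mark-preserving reduction : pre-add reducerValue, then subtract it back */
    unsigned const bumped    = entry + ((entry == ZSTD_DUBT_UNSORTED_MARK) ? reducerValue : 0);
    unsigned const preserved = (bumped < reducerValue) ? 0 : bumped - reducerValue;

    assert(plain == 0);                             /* "unsorted" information lost */
    assert(preserved == ZSTD_DUBT_UNSORTED_MARK);   /* "unsorted" information kept */
    return 0;
}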


@@ -15,48 +15,6 @@
/*-*************************************
* Binary Tree search
***************************************/
#define ZSTD_DUBT_UNSORTED_MARK 1   /* note: index 1 will now be confused with "unsorted" if sorted as larger than its predecessor.
                                       It's not a big deal though: the candidate will just be considered unsorted, and be sorted again.
                                       Additionally, candidate position 1 will be lost.
                                       But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss.
                                       The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy. */
/*! ZSTD_preserveUnsortedMark() :
 *  pre-emptively increase value of ZSTD_DUBT_UNSORTED_MARK before ZSTD_reduceTable()
 *  so that the combined operation preserves its value.
 *  Without it, ZSTD_DUBT_UNSORTED_MARK==1 would be squashed to 0.
 *  As a consequence, the list of unsorted elements would stop at the first element,
 *  removing candidates, resulting in a very small loss of compression ratio
 *  (since overflow protection with ZSTD_reduceTable() is relatively rare).
 *
 *  Another potential risk is that a position will be promoted from *unsorted*
 *  to *sorted=>smaller:0*, meaning the next candidate will be considered smaller.
 *  This could be wrong, and result in data corruption.
 *
 *  On second thought, this corruption might be impossible,
 *  because unsorted elements stand at the beginning of the list,
 *  and squashing to zero reduces the list to a single element,
 *  which needs to be sorted anyway.
 *  I haven't spent much thought on this possible scenario,
 *  and just felt it was safer to implement ZSTD_preserveUnsortedMark().
 *
 *  `size` : must be a positive multiple of ZSTD_ROWSIZE */
#define ZSTD_ROWSIZE 16
void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue)
{
    int cellNb = 0;
    U32 const nbRows = size / ZSTD_ROWSIZE;
    U32 rowNb;
    assert((size % ZSTD_ROWSIZE) == 0);
    for (rowNb=0 ; rowNb < nbRows ; rowNb++) {
        int column;
        for (column=0; column<ZSTD_ROWSIZE; column++) {
            U32 const adder = (table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) ? reducerValue : 0;
            table[cellNb] += adder;
            cellNb++;
    }   }
}
void ZSTD_updateDUBT(
                ZSTD_matchState_t* ms, ZSTD_compressionParameters const* cParams,