From 94efb1749d5b6fd4ce19d7299c1014999bea28cd Mon Sep 17 00:00:00 2001
From: Yann Collet
Date: Sat, 3 Feb 2018 23:54:10 -0800
Subject: [PATCH 1/3] faster decoding in 32-bit mode for long offsets (tentative)

On my laptop:

Before:
./zstd32 -b --zstd=wlog=27 silesia.tar enwik8 -S
 3#silesia.tar : 211984896 -> 66683478 (3.179), 97.6 MB/s , 400.7 MB/s
 3#enwik8 : 100000000 -> 35643153 (2.806), 76.5 MB/s , 303.2 MB/s

After:
./zstd32 -b --zstd=wlog=27 silesia.tar enwik8 -S
 3#silesia.tar : 211984896 -> 66683478 (3.179), 97.4 MB/s , 435.0 MB/s
 3#enwik8 : 100000000 -> 35643153 (2.806), 76.2 MB/s , 338.1 MB/s

Mileage varies, depending on file and CPU type. But a general rule is:
x86 benefits less from "long-offset mode" than x64, maybe due to register pressure.
On "entropy", long-mode is _never_ a win for x86.
On my laptop though, it can be, depending on file and compression level
(enwik8 benefits more from "long-mode" than silesia).
---
 lib/decompress/zstd_decompress.c | 43 +++++++++++++++++---------------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c
index 6cbce9be..1a1fb250 100644
--- a/lib/decompress/zstd_decompress.c
+++ b/lib/decompress/zstd_decompress.c
@@ -23,7 +23,7 @@
 /*!
 * LEGACY_SUPPORT :
-* if set to 1, ZSTD_decompress() can decode older formats (v0.1+)
+* if set to 1+, ZSTD_decompress() can decode older formats (v0.1+)
 */
 #ifndef ZSTD_LEGACY_SUPPORT
 # define ZSTD_LEGACY_SUPPORT 0
@@ -235,8 +235,8 @@ void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)
 
 /*-*************************************************************
-* Decompression section
-***************************************************************/
+ * Frame header decoding
+ ***************************************************************/
 
 /*! ZSTD_isFrame() :
  * Tells if the content of `buffer` starts with a valid Frame Identifier.
@@ -258,7 +258,7 @@ unsigned ZSTD_isFrame(const void* buffer, size_t size)
 
 /** ZSTD_frameHeaderSize_internal() :
  * srcSize must be large enough to reach header size fields.
- * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless
+ * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless.
 * @return : size of the Frame Header
 *           or an error code, which can be tested with ZSTD_isError() */
 static size_t ZSTD_frameHeaderSize_internal(const void* src, size_t srcSize, ZSTD_format_e format)
@@ -481,6 +481,10 @@ static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t he
 }
 
 
+/*-*************************************************************
+ * Block decoding
+ ***************************************************************/
+
 /*! ZSTD_getcBlockSize() :
 * Provides the size of compressed block from block header `src` */
 size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
@@ -936,13 +940,14 @@ static seq_t ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e l
             ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
             ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
             assert(ofBits <= MaxOff);
-            if (MEM_32bits() && longOffsets) {
-                U32 const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN_32-1);
+            if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
+                U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
                 offset = OF_base[ofCode] + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
-                if (MEM_32bits() || extraBits) BIT_reloadDStream(&seqState->DStream);
+                BIT_reloadDStream(&seqState->DStream);
                 if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
+                assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32);   /* to avoid another reload */
             } else {
-                offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits);   /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
+                offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/);   /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
                 if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
             }
         }
@@ -955,7 +960,7 @@ static seq_t ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e l
                 if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
                 seqState->prevOffset[1] = seqState->prevOffset[0];
                 seqState->prevOffset[0] = offset = temp;
-            } else {
+            } else {  /* offset == 0 */
                 offset = seqState->prevOffset[0];
             }
         } else {
@@ -967,16 +972,16 @@ static seq_t ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e l
     }
 
     seq.matchLength = ML_base[mlCode]
-                    + ((mlCode>31) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0);   /* <= 16 bits */
+                    + ((mlCode>31) ? BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/) : 0);   /* <= 16 bits */
     if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
         BIT_reloadDStream(&seqState->DStream);
     if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
         BIT_reloadDStream(&seqState->DStream);
-    /* Verify that there is enough bits to read the rest of the data in 64-bit mode. */
+    /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
     ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
 
     seq.litLength = LL_base[llCode]
-                  + ((llCode>15) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0);    /* <= 16 bits */
+                  + ((llCode>15) ? BIT_readBitsFast(&seqState->DStream, llBits/*>0*/) : 0);    /* <= 16 bits */
     if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
 
@@ -1364,13 +1369,13 @@ static size_t ZSTD_decompressSequencesLong(
         FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
 
         /* prepare in advance */
-        for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && seqNbfParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)));
-    /* windowSize could be any value at this point, since it is only validated
-     * in the streaming API.
-     */
     DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
     if (srcSize >= ZSTD_BLOCKSIZE_MAX) return ERROR(srcSize_wrong);
@@ -1429,7 +1430,9 @@ static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
         ip += litCSize;
         srcSize -= litCSize;
     }
-    if (frame && dctx->fParams.windowSize > (1<<23))
+    if ( frame /* windowSize exists */
+      && (dctx->fParams.windowSize > (1<<24))
+      && MEM_64bits() /* x86 benefits less from long mode than x64 */ )
         return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, isLongOffset);
     return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, isLongOffset);
 }

From 0170cf9a7af94d9ffdcdb5cd3ebc35fcd542f2ed Mon Sep 17 00:00:00 2001
From: Yann Collet
Date: Mon, 5 Feb 2018 11:46:02 -0800
Subject: [PATCH 2/3] minor : modified ZSTD_preserveUnsortedMark() to be more vectorization friendly

---
 lib/common/fse_decompress.c | 4 ++--
 lib/compress/zstd_lazy.c | 37 +++++++++++++++++++++-----------
 lib/decompress/zstd_decompress.c | 4 ++--
 3 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/lib/common/fse_decompress.c b/lib/common/fse_decompress.c
index 8e3f0035..4c66c3b7 100644
--- a/lib/common/fse_decompress.c
+++ b/lib/common/fse_decompress.c
@@ -139,8 +139,8 @@ size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned
     {   U32 u;
         for (u=0; usmaller:0*, meaning the next candidate will be considered smaller.
+ * to *sorted=>smaller:0*, meaning next candidate will be considered smaller.
 * This could be wrong, and result in data corruption.
+ *
 * On second thought, this corruption might be impossible,
- because unsorted elements are always at the beginning of the list,
- and squashing to zero reduce the list to a single element,
+ because unsorted elements stand at the beginning of the list,
+ and squashing to zero reduces the list to a single element,
 which needs to be sorted anyway.
 * I haven't spent much thoughts into this possible scenario,
- and just felt it was safer to implement ZSTD_preserveUnsortedMark() */
+ and just felt it was safer to implement ZSTD_preserveUnsortedMark()
+ *
+ * `size` : must be a positive multiple of ZSTD_ROWSIZE */
+#define ZSTD_ROWSIZE 16
 void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue)
 {
-    U32 u;
-    for (u=0; udtable;
         return 0;
     case set_repeat:
         if (!flagRepeatTable) return ERROR(corruption_detected);

From de68c2ff10f3800f5aec60abd26e63f00b5f32bc Mon Sep 17 00:00:00 2001
From: Yann Collet
Date: Wed, 7 Feb 2018 14:22:35 -0800
Subject: [PATCH 3/3] Merged ZSTD_preserveUnsortedMark() into ZSTD_reduceIndex()

as it's faster, due to one memory scan instead of two
(confirmed by microbenchmark).

Note : as ZSTD_reduceIndex() is rarely invoked,
it does not translate into a visible gain.
Consider it an exercise in auto-vectorization and micro-benchmarking.
---
 lib/compress/zstd_compress.c | 39 ++++++++++++++++---------
 lib/compress/zstd_compress_internal.h | 6 ++++
 lib/compress/zstd_lazy.c | 42 ---------------------------
 3 files changed, 32 insertions(+), 55 deletions(-)

diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
index 4a825f65..3f7ac849 100644
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -1223,32 +1223,44 @@ size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long
 
 #define ZSTD_ROWSIZE 16
-/*! ZSTD_reduceTable_internal() :
- * reduce table indexes by `reducerValue`
- * presume table size is a multiple of ZSTD_ROWSIZE.
- * Helps auto-vectorization */
-static void ZSTD_reduceTable_internal (U32* const table, int const nbRows, U32 const reducerValue)
+/*! ZSTD_reduceTable() :
+ * reduce table indexes by `reducerValue`, or squash to zero.
+ * PreserveMark preserves "unsorted mark" for btlazy2 strategy.
+ * It must be set to a clear 0/1 value, to remove branch during inlining.
+ * Presume table size is a multiple of ZSTD_ROWSIZE
+ * to help auto-vectorization */
+FORCE_INLINE_TEMPLATE void
+ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerValue, int const preserveMark)
 {
+    int const nbRows = (int)size / ZSTD_ROWSIZE;
     int cellNb = 0;
     int rowNb;
+    assert((size & (ZSTD_ROWSIZE-1)) == 0);   /* multiple of ZSTD_ROWSIZE */
+    assert(size < (1U<<31));   /* can be casted to int */
     for (rowNb=0 ; rowNb < nbRows ; rowNb++) {
         int column;
         for (column=0; columnappliedParams.cParams.strategy != ZSTD_fast) {
         U32 const chainSize = (U32)1 << zc->appliedParams.cParams.chainLog;
         if (zc->appliedParams.cParams.strategy == ZSTD_btlazy2)
-            ZSTD_preserveUnsortedMark(ms->chainTable, chainSize, reducerValue);
-        ZSTD_reduceTable(ms->chainTable, chainSize, reducerValue);
+            ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerValue);
+        else
+            ZSTD_reduceTable(ms->chainTable, chainSize, reducerValue);
     }
 
     if (ms->hashLog3) {
diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h
index cf593658..35b153fa 100644
--- a/lib/compress/zstd_compress_internal.h
+++ b/lib/compress/zstd_compress_internal.h
@@ -32,6 +32,12 @@ extern "C" {
 ***************************************/
 static const U32 g_searchStrength = 8;
 #define HASH_READ_SIZE 8
+#define ZSTD_DUBT_UNSORTED_MARK 1   /* For btlazy2 strategy, index 1 now means "unsorted".
+        It could be confused for a real successor at index "1", if sorted as larger than its predecessor.
+        It's not a big deal though : candidate will just be sorted again.
+        Additionnally, candidate position 1 will be lost.
+        But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss.
+        The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be misdhandled after table re-use with a different strategy */
 
 /*-*************************************
diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c
index 844afc05..8252135d 100644
--- a/lib/compress/zstd_lazy.c
+++ b/lib/compress/zstd_lazy.c
@@ -15,48 +15,6 @@
 /*-*************************************
 * Binary Tree search
 ***************************************/
-#define ZSTD_DUBT_UNSORTED_MARK 1   /* note : index 1 will now be confused with "unsorted" if sorted as larger than its predecessor.
-        It's not a big deal though : the candidate will just be considered unsorted, and be sorted again.
-        Additionnally, candidate position 1 will be lost.
-        But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss.
-        The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be misdhandled after table re-use with a different strategy */
-
-/*! ZSTD_preserveUnsortedMark() :
- * pre-emptively increase value of ZSTD_DUBT_UNSORTED_MARK before ZSTD_reduceTable()
- * so that combined operation preserves its value.
- * Without it, ZSTD_DUBT_UNSORTED_MARK==1 would be squashed to 0.
- * As a consequence, the list of unsorted elements would stop at first element,
- * removing candidates, resulting in a very small loss to compression ratio
- * (since overflow protection with ZSTD_reduceTable() is relatively rare).
- *
- * Another potential risk is that a position will be promoted from *unsorted*
- * to *sorted=>smaller:0*, meaning next candidate will be considered smaller.
- * This could be wrong, and result in data corruption.
- *
- * On second thought, this corruption might be impossible,
- * because unsorted elements stand at the beginning of the list,
- * and squashing to zero reduces the list to a single element,
- * which needs to be sorted anyway.
- * I haven't spent much thoughts into this possible scenario,
- * and just felt it was safer to implement ZSTD_preserveUnsortedMark()
- *
- * `size` : must be a positive multiple of ZSTD_ROWSIZE */
-#define ZSTD_ROWSIZE 16
-void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue)
-{
-    int cellNb = 0;
-    U32 const nbRows = size / ZSTD_ROWSIZE;
-    U32 rowNb;
-    assert((size % ZSTD_ROWSIZE) == 0);
-    for (rowNb=0 ; rowNb < nbRows ; rowNb++) {
-        int column;
-        for (column=0; column
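
A note on the 32-bit long-offset path changed by patch 1: when ofBits reaches STREAM_ACCUMULATOR_MIN_32, the offset field is consumed in two reads, the high (ofBits - extraBits) bits first, then a stream reload, then the remaining extraBits low bits, with the static assert bounding extraBits by LONG_OFFSETS_MAX_EXTRA_BITS_32 so a single reload suffices. The standalone sketch below only illustrates why the recombination `(high << extraBits) + low` reproduces the same value as one wide read; it uses plain integers instead of the real BIT_readBitsFast()/BIT_reloadDStream() stream, and the names `field`, `oneShot` and `twoStep` are illustrative only, not part of the patch.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t const ofBits    = 27;   /* e.g. an offset code needing windowLog=27 bits */
    uint32_t const extraBits = 5;    /* bounded by LONG_OFFSETS_MAX_EXTRA_BITS_32 in the patch */
    uint32_t const field     = 0x5A5A5A5u & ((1u << ofBits) - 1);   /* sample offset bit-field */

    /* one wide read : what a 64-bit accumulator can afford directly */
    uint32_t const oneShot  = field;

    /* two-step read (32-bit mode) : high bits first, then the low extraBits after a reload */
    uint32_t const highPart = field >> extraBits;                /* ofBits - extraBits bits */
    uint32_t const lowPart  = field & ((1u << extraBits) - 1);   /* extraBits bits */
    uint32_t const twoStep  = (highPart << extraBits) + lowPart;

    assert(twoStep == oneShot);
    printf("offset field %u reconstructed identically in two steps\n", twoStep);
    return 0;
}

Splitting the read this way is what lets a 32-bit accumulator handle offsets wider than it can guarantee after a single reload, at the cost of one extra reload on that path only.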
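For the table-reduction rework in patches 2 and 3, the idea is a single pass over the table organised in fixed-width rows of ZSTD_ROWSIZE cells, so the inner loop has a compile-time trip count and auto-vectorizes, while for btlazy2 the "unsorted" mark (index 1 = ZSTD_DUBT_UNSORTED_MARK) must survive the subtraction of reducerValue. The sketch below is a rough approximation of that combined pass, not the committed ZSTD_reduceTable_internal(): `reduceTable_demo` is a made-up name, and `preserveMark` is an ordinary runtime argument here, whereas the patch passes it as a constant 0/1 into a FORCE_INLINE_TEMPLATE function so the branch disappears after inlining.

#include <assert.h>
#include <stdint.h>

typedef uint32_t U32;

#define ZSTD_ROWSIZE            16
#define ZSTD_DUBT_UNSORTED_MARK  1

static void reduceTable_demo(U32* const table, U32 const size,
                             U32 const reducerValue, int const preserveMark)
{
    int const nbRows = (int)(size / ZSTD_ROWSIZE);
    int cellNb = 0;
    int rowNb;
    assert((size % ZSTD_ROWSIZE) == 0);   /* fixed-width rows => vectorizable inner loop */
    for (rowNb = 0; rowNb < nbRows; rowNb++) {
        int column;
        for (column = 0; column < ZSTD_ROWSIZE; column++) {
            /* keep the btlazy2 "unsorted" mark alive across the reduction */
            if (preserveMark && (table[cellNb] == ZSTD_DUBT_UNSORTED_MARK))
                table[cellNb] = ZSTD_DUBT_UNSORTED_MARK + reducerValue;
            /* reduce index by reducerValue, squashing to zero on underflow */
            if (table[cellNb] < reducerValue) table[cellNb] = 0;
            else table[cellNb] -= reducerValue;
            cellNb++;
        }
    }
}

int main(void)
{
    U32 table[ZSTD_ROWSIZE] = { ZSTD_DUBT_UNSORTED_MARK, 1000, 5000, 12345 };  /* remaining cells are zero */
    reduceTable_demo(table, ZSTD_ROWSIZE, 4000, 1 /*preserveMark*/);
    assert(table[0] == ZSTD_DUBT_UNSORTED_MARK);  /* mark preserved */
    assert(table[1] == 0);                        /* below reducerValue : squashed */
    assert(table[2] == 1000);                     /* 5000 - 4000 */
    return 0;
}

Keeping the inner loop free of data-dependent control flow (the mark handling and the clamp are both simple select patterns) is what makes the single scan both vectorizable and cheaper than the previous two passes.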