[LDM] Speed optimization on repetitive data

LDM does especially poorly on repetitive data when that data's hash happens to satisfy `(hash & stopMask) == 0`, either because `stopMask == 0` or by random chance. Optimize this case by skipping over repetitive patterns. The detection is very simplistic, but should catch most of the offending cases.

```
head -c 1G /dev/zero | perf stat -- ./zstd -1 -o /dev/null -v --zstd=ldmHashRateLog=1 --long
21.187881087 seconds time elapsed

head -c 1G /dev/zero | perf stat -- ./zstd -1 -o /dev/null -v --zstd=ldmHashRateLog=1 --long
1.149707921 seconds time elapsed
```
2021-05-03 14:32:15 -07:00 · 2021-05-03 14:32:15 -07:00 · 32823bc150
parent 0e2345b859
commit 32823bc150
1 changed files with 47 additions and 12 deletions
--- a/lib/compress/zstd_ldm.c
+++ b/lib/compress/zstd_ldm.c
@ -57,6 +57,33 @@ static void ZSTD_ldm_gear_init(ldmRollingHashState_t* state, ldmParams_t const*
    }
 }

+/** ZSTD_ldm_gear_reset()
+ * Feeds [data, data + minMatchLength) into the rolling hash without
+ * registering any splits, then stores the result back into `state`. This
+ * effectively resets the hash state. It is used when skipping over data,
+ * either at the beginning of a block, or when skipping sections.
+ *
+ * @param state           rolling hash state; state->rolling is read as the
+ *                        starting value and overwritten with the final hash.
+ * @param data            first byte to hash; the caller must guarantee that
+ *                        minMatchLength bytes are readable from here.
+ * @param minMatchLength  number of bytes to feed into the hash.
+ */
+static void ZSTD_ldm_gear_reset(ldmRollingHashState_t* state,
+                                BYTE const* data, size_t minMatchLength)
+{
+    U64 hash = state->rolling;
+    size_t n = 0;
+
+#define GEAR_ITER_ONCE() do {                                  \
+        hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \
+        n += 1;                                                \
+    } while (0)
+    /* Main loop, manually unrolled to consume 4 bytes per iteration. */
+    while (n + 3 < minMatchLength) {
+        GEAR_ITER_ONCE();
+        GEAR_ITER_ONCE();
+        GEAR_ITER_ONCE();
+        GEAR_ITER_ONCE();
+    }
+    /* Tail: the remaining (minMatchLength % 4) bytes. */
+    while (n < minMatchLength) {
+        GEAR_ITER_ONCE();
+    }
+#undef GEAR_ITER_ONCE
+
+    /* Commit the recomputed hash. Without this store the work above is
+     * discarded and the caller's state is left untouched. */
+    state->rolling = hash;
+}
+
 /** ZSTD_ldm_gear_feed():
 *
 * Registers in the splits array all the split points found in the first
@ -255,7 +282,7 @@ void ZSTD_ldm_fillHashTable(
    while (ip < iend) {
        size_t hashed;
        unsigned n;
-        
+
        numSplits = 0;
        hashed = ZSTD_ldm_gear_feed(&hashState, ip, iend - ip, splits, &numSplits);

@ -327,16 +354,8 @@ static size_t ZSTD_ldm_generateSequences_internal(

    /* Initialize the rolling hash state with the first minMatchLength bytes */
    ZSTD_ldm_gear_init(&hashState, params);
-    {
-        size_t n = 0;
-
-        while (n < minMatchLength) {
-            numSplits = 0;
-            n += ZSTD_ldm_gear_feed(&hashState, ip + n, minMatchLength - n,
-                                    splits, &numSplits);
-        }
-        ip += minMatchLength;
-    }
+    ZSTD_ldm_gear_reset(&hashState, ip, minMatchLength);
+    ip += minMatchLength;

    while (ip < ilimit) {
        size_t hashed;
@ -361,6 +380,7 @@ static size_t ZSTD_ldm_generateSequences_internal(
        for (n = 0; n < numSplits; n++) {
            size_t forwardMatchLength = 0, backwardMatchLength = 0,
                   bestMatchLength = 0, mLength;
+            U32 offset;
            BYTE const* const split = candidates[n].split;
            U32 const checksum = candidates[n].checksum;
            U32 const hash = candidates[n].hash;
@ -428,9 +448,9 @@ static size_t ZSTD_ldm_generateSequences_internal(
            }

            /* Match found */
+            offset = (U32)(split - base) - bestEntry->offset;
            mLength = forwardMatchLength + backwardMatchLength;
            {
-                U32 const offset = (U32)(split - base) - bestEntry->offset;
                rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size;

                /* Out of sequence storage */
@ -447,6 +467,21 @@ static size_t ZSTD_ldm_generateSequences_internal(
            ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params);

            anchor = split + forwardMatchLength;
+
+            /* If we find a match that ends after the data that we've hashed
+             * then we have a repeating, overlapping, pattern. E.g. all zeros.
+             * If one repetition of the pattern matches our `stopMask` then all
+             * repetitions will. We don't need to insert them all into our table,
+             * only the first one. So skip over overlapping matches.
+             * This is a major speed boost (20x) for compressing a single byte
+             * repeated, when that byte ends up in the table.
+             */
+            if (anchor > ip + hashed) {
+                ZSTD_ldm_gear_reset(&hashState, anchor - minMatchLength, minMatchLength);
+                /* Continue the outer loop at anchor (ip + hashed == anchor). */
+                ip = anchor - hashed;
+                break;
+            }
        }

        ip += hashed;