Use a single buffer in zstdmt

Summary:
Allocate a single input buffer large enough to house each job, plus enough
space for the IO thread to write 2 extra buffers: one sits in the `POOL`
queue while the IO thread fills the next, then blocks on a full `POOL`
queue. Since a new buffer can't overlap with the prefix of the previous
job, we allocate space for 3 extra input buffers when an overlap is used.
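
A minimal sketch of this sizing rule, mirroring the arithmetic this diff
adds to ZSTDMT_initCStream_internal (`roundBuffCapacity` is an
illustrative name, not part of the change):

```c
#include <stddef.h>

/* Round buffer capacity: one section per worker, up to two sections
 * of slack for the IO thread (one job queued in POOL plus one section
 * being filled), and one extra section when an overlap prefix is in
 * use, since a fresh section may not overlap the previous job's
 * prefix. */
static size_t roundBuffCapacity(size_t nbWorkers, size_t sectionSize,
                                size_t overlapSize)
{
    size_t const workers = nbWorkers > 1 ? nbWorkers : 1;
    size_t const slack   = (workers < 2 ? workers : 2)
                         + (overlapSize > 0 ? 1 : 0);
    /* e.g. 4 workers with overlap enabled => (4 + 3) * sectionSize */
    return (workers + slack) * sectionSize;
}
```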

Test Plan:
* CI
* With and without ASAN/UBSAN, run zstdmt with different numbers of
  threads on two large binaries, and verify that their checksums match.
* Test on the tip of the zstdmt ldm integration.

Reviewers: cyan

Differential Revision: https://phabricator.intern.facebook.com/D7284007

Tasks: T25664120
dev
Nick Terrell 2018-02-27 20:09:18 -08:00
parent a271399c97
commit f15a17e19f
2 changed files with 246 additions and 117 deletions

lib/compress/zstdmt_compress.c

@@ -309,6 +309,13 @@ static void ZSTDMT_releaseCCtx(ZSTDMT_CCtxPool* pool, ZSTD_CCtx* cctx)
/* ===== Worker thread ===== */
/* ------------------------------------------ */
typedef struct {
void const* start;
size_t size;
} range_t;
static const range_t kNullRange = { NULL, 0 };
typedef struct {
size_t consumed; /* SHARED - set0 by mtctx, then modified by worker AND read by mtctx */
size_t cSize; /* SHARED - set0 by mtctx, then modified by worker AND read by mtctx, then set0 by mtctx */
@@ -317,10 +324,8 @@ typedef struct {
ZSTDMT_CCtxPool* cctxPool; /* Thread-safe - used by mtctx and (all) workers */
ZSTDMT_bufferPool* bufPool; /* Thread-safe - used by mtctx and (all) workers */
buffer_t dstBuff; /* set by worker (or mtctx), then read by worker & mtctx, then modified by mtctx => no barrier */
buffer_t srcBuff; /* set by mtctx, then released by worker => no barrier */
const void* prefixStart; /* set by mtctx, then read and set0 by worker => no barrier */
size_t prefixSize; /* set by mtctx, then read by worker => no barrier */
size_t srcSize; /* set by mtctx, then read by worker & mtctx => no barrier */
range_t prefix; /* set by mtctx, then read by worker & mtctx => no barrier */
range_t src; /* set by mtctx, then read by worker & mtctx => no barrier */
unsigned firstJob; /* set by mtctx, then read by worker => no barrier */
unsigned lastJob; /* set by mtctx, then read by worker => no barrier */
ZSTD_CCtx_params params; /* set by mtctx, then read by worker => no barrier */
@@ -335,7 +340,6 @@ void ZSTDMT_compressionJob(void* jobDescription)
{
ZSTDMT_jobDescription* const job = (ZSTDMT_jobDescription*)jobDescription;
ZSTD_CCtx* const cctx = ZSTDMT_getCCtx(job->cctxPool);
const void* const src = (const char*)job->prefixStart + job->prefixSize;
buffer_t dstBuff = job->dstBuff;
/* resources */
@@ -358,7 +362,7 @@ void ZSTDMT_compressionJob(void* jobDescription)
assert(job->firstJob); /* only allowed for first job */
if (ZSTD_isError(initError)) { job->cSize = initError; goto _endJob; }
} else { /* srcStart points at reloaded section */
U64 const pledgedSrcSize = job->firstJob ? job->fullFrameSize : job->srcSize;
U64 const pledgedSrcSize = job->firstJob ? job->fullFrameSize : job->src.size;
ZSTD_CCtx_params jobParams = job->params; /* do not modify job->params ! copy it, modify the copy */
{ size_t const forceWindowError = ZSTD_CCtxParam_setParameter(&jobParams, ZSTD_p_forceMaxWindow, !job->firstJob);
if (ZSTD_isError(forceWindowError)) {
@@ -366,7 +370,7 @@ void ZSTDMT_compressionJob(void* jobDescription)
goto _endJob;
} }
{ size_t const initError = ZSTD_compressBegin_advanced_internal(cctx,
job->prefixStart, job->prefixSize, ZSTD_dm_rawContent, /* load dictionary in "content-only" mode (no header analysis) */
job->prefix.start, job->prefix.size, ZSTD_dm_rawContent, /* load dictionary in "content-only" mode (no header analysis) */
NULL, /*cdict*/
jobParams, pledgedSrcSize);
if (ZSTD_isError(initError)) {
@@ -374,7 +378,7 @@ void ZSTDMT_compressionJob(void* jobDescription)
goto _endJob;
} } }
if (!job->firstJob) { /* flush and overwrite frame header when it's not first job */
size_t const hSize = ZSTD_compressContinue(cctx, dstBuff.start, dstBuff.capacity, src, 0);
size_t const hSize = ZSTD_compressContinue(cctx, dstBuff.start, dstBuff.capacity, job->src.start, 0);
if (ZSTD_isError(hSize)) { job->cSize = hSize; /* save error code */ goto _endJob; }
DEBUGLOG(5, "ZSTDMT_compressionJob: flush and overwrite %u bytes of frame header (not first job)", (U32)hSize);
ZSTD_invalidateRepCodes(cctx);
@@ -382,14 +386,14 @@ void ZSTDMT_compressionJob(void* jobDescription)
/* compress */
{ size_t const chunkSize = 4*ZSTD_BLOCKSIZE_MAX;
int const nbChunks = (int)((job->srcSize + (chunkSize-1)) / chunkSize);
const BYTE* ip = (const BYTE*) src;
int const nbChunks = (int)((job->src.size + (chunkSize-1)) / chunkSize);
const BYTE* ip = (const BYTE*) job->src.start;
BYTE* const ostart = (BYTE*)dstBuff.start;
BYTE* op = ostart;
BYTE* oend = op + dstBuff.capacity;
int chunkNb;
if (sizeof(size_t) > sizeof(int)) assert(job->srcSize < ((size_t)INT_MAX) * chunkSize); /* check overflow */
DEBUGLOG(5, "ZSTDMT_compressionJob: compress %u bytes in %i blocks", (U32)job->srcSize, nbChunks);
if (sizeof(size_t) > sizeof(int)) assert(job->src.size < ((size_t)INT_MAX) * chunkSize); /* check overflow */
DEBUGLOG(5, "ZSTDMT_compressionJob: compress %u bytes in %i blocks", (U32)job->src.size, nbChunks);
assert(job->cSize == 0);
for (chunkNb = 1; chunkNb < nbChunks; chunkNb++) {
size_t const cSize = ZSTD_compressContinue(cctx, op, oend-op, ip, chunkSize);
@@ -408,8 +412,8 @@ void ZSTDMT_compressionJob(void* jobDescription)
/* last block */
assert(chunkSize > 0); assert((chunkSize & (chunkSize - 1)) == 0); /* chunkSize must be power of 2 for mask==(chunkSize-1) to work */
if ((nbChunks > 0) | job->lastJob /*must output a "last block" flag*/ ) {
size_t const lastBlockSize1 = job->srcSize & (chunkSize-1);
size_t const lastBlockSize = ((lastBlockSize1==0) & (job->srcSize>=chunkSize)) ? chunkSize : lastBlockSize1;
size_t const lastBlockSize1 = job->src.size & (chunkSize-1);
size_t const lastBlockSize = ((lastBlockSize1==0) & (job->src.size>=chunkSize)) ? chunkSize : lastBlockSize1;
size_t const cSize = (job->lastJob) ?
ZSTD_compressEnd (cctx, op, oend-op, ip, lastBlockSize) :
ZSTD_compressContinue(cctx, op, oend-op, ip, lastBlockSize);
@@ -421,13 +425,14 @@ void ZSTDMT_compressionJob(void* jobDescription)
} }
_endJob:
if (job->prefix.size > 0)
DEBUGLOG(5, "Finished with prefix: %zx", (size_t)job->prefix.start);
DEBUGLOG(5, "Finished with source: %zx", (size_t)job->src.start);
/* release resources */
ZSTDMT_releaseCCtx(job->cctxPool, cctx);
ZSTDMT_releaseBuffer(job->bufPool, job->srcBuff);
job->srcBuff = g_nullBuffer; job->prefixStart = NULL;
/* report */
ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);
job->consumed = job->srcSize;
job->consumed = job->src.size;
ZSTD_pthread_cond_signal(&job->job_cond);
ZSTD_pthread_mutex_unlock(&job->job_mutex);
}
@@ -438,12 +443,27 @@ _endJob:
/* ------------------------------------------ */
typedef struct {
range_t prefix; /* read-only non-owned prefix buffer */
buffer_t buffer;
size_t targetCapacity; /* note : buffers provided by the pool may be larger than target capacity */
size_t prefixSize;
size_t filled;
} inBuff_t;
typedef struct {
BYTE* buffer; /* The round input buffer. All jobs get references
* to pieces of the buffer. ZSTDMT_tryGetInputRange()
* handles handing out job input buffers, and makes
* sure it doesn't overlap with any pieces still in use.
*/
size_t capacity; /* The capacity of buffer. */
size_t pos; /* The position of the current inBuff in the round
* buffer. Updated past the end of the inBuff once
* the inBuff is sent to the worker thread.
* pos <= capacity.
*/
} roundBuff_t;
static const roundBuff_t kNullRoundBuff = {NULL, 0, 0};
struct ZSTDMT_CCtx_s {
POOL_ctx* factory;
ZSTDMT_jobDescription* jobs;
@@ -452,6 +472,7 @@ struct ZSTDMT_CCtx_s {
ZSTD_CCtx_params params;
size_t targetSectionSize;
size_t targetPrefixSize;
roundBuff_t roundBuff;
inBuff_t inBuff;
int jobReady; /* 1 => one job is already prepared, but pool has shortage of workers. Don't create another one. */
XXH64_state_t xxhState;
@@ -538,6 +559,7 @@ ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers, ZSTD_customMem cMem)
mtctx->jobIDMask = nbJobs - 1;
mtctx->bufPool = ZSTDMT_createBufferPool(nbWorkers, cMem);
mtctx->cctxPool = ZSTDMT_createCCtxPool(nbWorkers, cMem);
mtctx->roundBuff = kNullRoundBuff;
if (!mtctx->factory | !mtctx->jobs | !mtctx->bufPool | !mtctx->cctxPool) {
ZSTDMT_freeCCtx(mtctx);
return NULL;
@@ -563,14 +585,10 @@ static void ZSTDMT_releaseAllJobResources(ZSTDMT_CCtx* mtctx)
ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[jobID].dstBuff);
mtctx->jobs[jobID].dstBuff = g_nullBuffer;
mtctx->jobs[jobID].cSize = 0;
DEBUGLOG(4, "job%02u: release src address %08X", jobID, (U32)(size_t)mtctx->jobs[jobID].srcBuff.start);
ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[jobID].srcBuff);
mtctx->jobs[jobID].srcBuff = g_nullBuffer;
}
memset(mtctx->jobs, 0, (mtctx->jobIDMask+1)*sizeof(ZSTDMT_jobDescription));
DEBUGLOG(4, "input: release address %08X", (U32)(size_t)mtctx->inBuff.buffer.start);
ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->inBuff.buffer);
mtctx->inBuff.buffer = g_nullBuffer;
mtctx->inBuff.filled = 0;
mtctx->allJobsCompleted = 1;
}
@@ -580,7 +598,7 @@ static void ZSTDMT_waitForAllJobsCompleted(ZSTDMT_CCtx* mtctx)
while (mtctx->doneJobID < mtctx->nextJobID) {
unsigned const jobID = mtctx->doneJobID & mtctx->jobIDMask;
ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[jobID].job_mutex);
while (mtctx->jobs[jobID].consumed < mtctx->jobs[jobID].srcSize) {
while (mtctx->jobs[jobID].consumed < mtctx->jobs[jobID].src.size) {
DEBUGLOG(5, "waiting for jobCompleted signal from job %u", mtctx->doneJobID); /* we want to block when waiting for data to flush */
ZSTD_pthread_cond_wait(&mtctx->jobs[jobID].job_cond, &mtctx->jobs[jobID].job_mutex);
}
@@ -598,6 +616,8 @@ size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx)
ZSTDMT_freeBufferPool(mtctx->bufPool);
ZSTDMT_freeCCtxPool(mtctx->cctxPool);
ZSTD_freeCDict(mtctx->cdictLocal);
if (mtctx->roundBuff.buffer)
ZSTD_free(mtctx->roundBuff.buffer, mtctx->cMem);
ZSTD_free(mtctx, mtctx->cMem);
return 0;
}
@@ -610,7 +630,8 @@ size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx)
+ ZSTDMT_sizeof_bufferPool(mtctx->bufPool)
+ (mtctx->jobIDMask+1) * sizeof(ZSTDMT_jobDescription)
+ ZSTDMT_sizeof_CCtxPool(mtctx->cctxPool)
+ ZSTD_sizeof_CDict(mtctx->cdictLocal);
+ ZSTD_sizeof_CDict(mtctx->cdictLocal)
+ mtctx->roundBuff.capacity;
}
/* Internal only */
@@ -701,8 +722,7 @@ ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx)
DEBUGLOG(6, "ZSTDMT_getFrameProgression");
fps.consumed = mtctx->consumed;
fps.produced = mtctx->produced;
assert(mtctx->inBuff.filled >= mtctx->inBuff.prefixSize);
fps.ingested = mtctx->consumed + (mtctx->inBuff.filled - mtctx->inBuff.prefixSize);
fps.ingested = mtctx->consumed + mtctx->inBuff.filled;
{ unsigned jobNb;
unsigned lastJobNb = mtctx->nextJobID + mtctx->jobReady; assert(mtctx->jobReady <= 1);
DEBUGLOG(6, "ZSTDMT_getFrameProgression: jobs: from %u to <%u (jobReady:%u)",
@@ -713,7 +733,7 @@ ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx)
{ size_t const cResult = mtctx->jobs[wJobID].cSize;
size_t const produced = ZSTD_isError(cResult) ? 0 : cResult;
fps.consumed += mtctx->jobs[wJobID].consumed;
fps.ingested += mtctx->jobs[wJobID].srcSize;
fps.ingested += mtctx->jobs[wJobID].src.size;
fps.produced += produced;
}
ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);
@@ -795,10 +815,10 @@ static size_t ZSTDMT_compress_advanced_internal(
buffer_t const dstBuffer = u < compressWithinDst ? dstAsBuffer : g_nullBuffer;
size_t dictSize = u ? overlapSize : 0;
mtctx->jobs[u].srcBuff = g_nullBuffer;
mtctx->jobs[u].prefixStart = srcStart + frameStartPos - dictSize;
mtctx->jobs[u].prefixSize = dictSize;
mtctx->jobs[u].srcSize = jobSize; assert(jobSize > 0); /* avoid job.srcSize == 0 */
mtctx->jobs[u].prefix.start = srcStart + frameStartPos - dictSize;
mtctx->jobs[u].prefix.size = dictSize;
mtctx->jobs[u].src.start = srcStart + frameStartPos;
mtctx->jobs[u].src.size = jobSize; assert(jobSize > 0); /* avoid job.src.size == 0 */
mtctx->jobs[u].consumed = 0;
mtctx->jobs[u].cSize = 0;
mtctx->jobs[u].cdict = (u==0) ? cdict : NULL;
@@ -817,7 +837,7 @@ static size_t ZSTDMT_compress_advanced_internal(
}
DEBUGLOG(5, "ZSTDMT_compress_advanced_internal: posting job %u (%u bytes)", u, (U32)jobSize);
DEBUG_PRINTHEX(6, mtctx->jobs[u].prefixStart, 12);
DEBUG_PRINTHEX(6, mtctx->jobs[u].prefix.start, 12);
POOL_add(mtctx->factory, ZSTDMT_compressionJob, &mtctx->jobs[u]);
frameStartPos += jobSize;
@@ -831,14 +851,13 @@ static size_t ZSTDMT_compress_advanced_internal(
for (jobID=0; jobID<nbJobs; jobID++) {
DEBUGLOG(5, "waiting for job %u ", jobID);
ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[jobID].job_mutex);
while (mtctx->jobs[jobID].consumed < mtctx->jobs[jobID].srcSize) {
while (mtctx->jobs[jobID].consumed < mtctx->jobs[jobID].src.size) {
DEBUGLOG(5, "waiting for jobCompleted signal from job %u", jobID);
ZSTD_pthread_cond_wait(&mtctx->jobs[jobID].job_cond, &mtctx->jobs[jobID].job_mutex);
}
ZSTD_pthread_mutex_unlock(&mtctx->jobs[jobID].job_mutex);
DEBUGLOG(5, "ready to write job %u ", jobID);
mtctx->jobs[jobID].prefixStart = NULL;
{ size_t const cSize = mtctx->jobs[jobID].cSize;
if (ZSTD_isError(cSize)) error = cSize;
if ((!error) && (dstPos + cSize > dstCapacity)) error = ERROR(dstSize_tooSmall);
@@ -967,11 +986,30 @@ size_t ZSTDMT_initCStream_internal(
if (mtctx->targetSectionSize < ZSTDMT_JOBSIZE_MIN) mtctx->targetSectionSize = ZSTDMT_JOBSIZE_MIN;
if (mtctx->targetSectionSize < mtctx->targetPrefixSize) mtctx->targetSectionSize = mtctx->targetPrefixSize; /* job size must be >= overlap size */
DEBUGLOG(4, "Job Size : %u KB (note : set to %u)", (U32)(mtctx->targetSectionSize>>10), params.jobSize);
mtctx->inBuff.targetCapacity = mtctx->targetPrefixSize + mtctx->targetSectionSize;
DEBUGLOG(4, "inBuff Size : %u KB", (U32)(mtctx->inBuff.targetCapacity>>10));
ZSTDMT_setBufferSize(mtctx->bufPool, MAX(mtctx->inBuff.targetCapacity, ZSTD_compressBound(mtctx->targetSectionSize)) );
DEBUGLOG(4, "inBuff Size : %u KB", (U32)(mtctx->targetSectionSize>>10));
ZSTDMT_setBufferSize(mtctx->bufPool, ZSTD_compressBound(mtctx->targetSectionSize));
{
/* Two buffers of slack, plus extra space for the overlap */
size_t const nbWorkers = MAX(mtctx->params.nbWorkers, 1);
size_t const nbSlackBuffers = MIN(nbWorkers, 2) + (mtctx->targetPrefixSize > 0);
size_t const nbSections = nbWorkers + nbSlackBuffers;
size_t const capacity = mtctx->targetSectionSize * nbSections;
if (mtctx->roundBuff.capacity < capacity) {
if (mtctx->roundBuff.buffer)
ZSTD_free(mtctx->roundBuff.buffer, mtctx->cMem);
mtctx->roundBuff.buffer = (BYTE*)ZSTD_malloc(capacity, mtctx->cMem);
if (mtctx->roundBuff.buffer == NULL) {
mtctx->roundBuff.capacity = 0;
return ERROR(memory_allocation);
}
mtctx->roundBuff.capacity = capacity;
}
}
DEBUGLOG(4, "roundBuff capacity : %u KB", (U32)(mtctx->roundBuff.capacity>>10));
mtctx->roundBuff.pos = 0;
mtctx->inBuff.buffer = g_nullBuffer;
mtctx->inBuff.prefixSize = 0;
mtctx->inBuff.filled = 0;
mtctx->inBuff.prefix = kNullRange;
mtctx->doneJobID = 0;
mtctx->nextJobID = 0;
mtctx->frameEnded = 0;
@@ -1038,17 +1076,16 @@ size_t ZSTDMT_initCStream(ZSTDMT_CCtx* mtctx, int compressionLevel) {
static void ZSTDMT_writeLastEmptyBlock(ZSTDMT_jobDescription* job)
{
assert(job->lastJob == 1);
assert(job->srcSize == 0); /* last job is empty -> will be simplified into a last empty block */
assert(job->src.size == 0); /* last job is empty -> will be simplified into a last empty block */
assert(job->firstJob == 0); /* cannot be first job, as it also needs to create frame header */
/* A job created by streaming variant starts with a src buffer, but no dst buffer.
* It summons a dstBuffer itself, compresses into it, then releases srcBuffer, and gives result to mtctx.
* When done, srcBuffer is empty, while dstBuffer is filled, and will be released by mtctx.
* This shortcut will simply switch srcBuffer for dstBuffer, providing same outcome as a normal job */
assert(job->dstBuff.start == NULL); /* invoked from streaming variant only (otherwise, dstBuff might be user's output) */
assert(job->srcBuff.start != NULL); /* invoked from streaming variant only (otherwise, srcBuff might be user's input) */
assert(job->srcBuff.capacity >= ZSTD_blockHeaderSize); /* no buffer should ever be that small */
job->dstBuff = job->srcBuff;
job->srcBuff = g_nullBuffer;
job->dstBuff = ZSTDMT_getBuffer(job->bufPool);
if (job->dstBuff.start == NULL) {
job->cSize = ERROR(memory_allocation);
return;
}
assert(job->dstBuff.capacity >= ZSTD_blockHeaderSize); /* no buffer should ever be that small */
job->src = kNullRange;
job->cSize = ZSTD_writeLastEmptyBlock(job->dstBuff.start, job->dstBuff.capacity);
assert(!ZSTD_isError(job->cSize));
assert(job->consumed == 0);
@@ -1066,14 +1103,13 @@ static size_t ZSTDMT_createCompressionJob(ZSTDMT_CCtx* mtctx, size_t srcSize, ZS
}
if (!mtctx->jobReady) {
BYTE const* src = (BYTE const*)mtctx->inBuff.buffer.start;
DEBUGLOG(5, "ZSTDMT_createCompressionJob: preparing job %u to compress %u bytes with %u preload ",
mtctx->nextJobID, (U32)srcSize, (U32)mtctx->inBuff.prefixSize);
assert(mtctx->jobs[jobID].srcBuff.start == NULL); /* no buffer left : supposed already released */
mtctx->jobs[jobID].srcBuff = mtctx->inBuff.buffer;
mtctx->jobs[jobID].prefixStart = mtctx->inBuff.buffer.start;
mtctx->jobs[jobID].prefixSize = mtctx->inBuff.prefixSize;
mtctx->jobs[jobID].srcSize = srcSize;
assert(mtctx->inBuff.filled >= srcSize + mtctx->inBuff.prefixSize);
mtctx->nextJobID, (U32)srcSize, (U32)mtctx->inBuff.prefix.size);
mtctx->jobs[jobID].src.start = src;
mtctx->jobs[jobID].src.size = srcSize;
assert(mtctx->inBuff.filled >= srcSize);
mtctx->jobs[jobID].prefix = mtctx->inBuff.prefix;
mtctx->jobs[jobID].consumed = 0;
mtctx->jobs[jobID].cSize = 0;
mtctx->jobs[jobID].params = mtctx->params;
@@ -1089,29 +1125,20 @@ static size_t ZSTDMT_createCompressionJob(ZSTDMT_CCtx* mtctx, size_t srcSize, ZS
mtctx->jobs[jobID].frameChecksumNeeded = endFrame && (mtctx->nextJobID>0) && mtctx->params.fParams.checksumFlag;
mtctx->jobs[jobID].dstFlushed = 0;
if (mtctx->params.fParams.checksumFlag)
XXH64_update(&mtctx->xxhState, (const char*)mtctx->inBuff.buffer.start + mtctx->inBuff.prefixSize, srcSize);
if (mtctx->params.fParams.checksumFlag && srcSize > 0)
XXH64_update(&mtctx->xxhState, src, srcSize);
/* get a new buffer for next input */
/* Update the round buffer pos and clear the input buffer to be reset */
mtctx->roundBuff.pos += srcSize;
mtctx->inBuff.buffer = g_nullBuffer;
mtctx->inBuff.filled = 0;
/* Set the prefix */
if (!endFrame) {
size_t const newPrefixSize = MIN(mtctx->inBuff.filled, mtctx->targetPrefixSize);
mtctx->inBuff.buffer = ZSTDMT_getBuffer(mtctx->bufPool);
if (mtctx->inBuff.buffer.start == NULL) { /* not enough memory to allocate a new input buffer */
mtctx->jobs[jobID].srcSize = mtctx->jobs[jobID].consumed = 0;
mtctx->nextJobID++;
ZSTDMT_waitForAllJobsCompleted(mtctx);
ZSTDMT_releaseAllJobResources(mtctx);
return ERROR(memory_allocation);
}
mtctx->inBuff.filled -= (mtctx->inBuff.prefixSize + srcSize) - newPrefixSize;
memmove(mtctx->inBuff.buffer.start, /* copy end of current job into next job, as "prefix" */
(const char*)mtctx->jobs[jobID].prefixStart + mtctx->inBuff.prefixSize + srcSize - newPrefixSize,
mtctx->inBuff.filled);
mtctx->inBuff.prefixSize = newPrefixSize;
size_t const newPrefixSize = MIN(srcSize, mtctx->targetPrefixSize);
mtctx->inBuff.prefix.start = src + srcSize - newPrefixSize;
mtctx->inBuff.prefix.size = newPrefixSize;
} else { /* endFrame==1 => no need for another input buffer */
mtctx->inBuff.buffer = g_nullBuffer;
mtctx->inBuff.filled = 0;
mtctx->inBuff.prefixSize = 0;
mtctx->inBuff.prefix = kNullRange;
mtctx->frameEnded = endFrame;
if (mtctx->nextJobID == 0) {
/* single job exception : checksum is already calculated directly within worker thread */
@@ -1130,7 +1157,7 @@ static size_t ZSTDMT_createCompressionJob(ZSTDMT_CCtx* mtctx, size_t srcSize, ZS
DEBUGLOG(5, "ZSTDMT_createCompressionJob: posting job %u : %u bytes (end:%u, jobNb == %u (mod:%u))",
mtctx->nextJobID,
(U32)mtctx->jobs[jobID].srcSize,
(U32)mtctx->jobs[jobID].src.size,
mtctx->jobs[jobID].lastJob,
mtctx->nextJobID,
jobID);
@@ -1161,9 +1188,9 @@ static size_t ZSTDMT_flushProduced(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, u
&& (mtctx->doneJobID < mtctx->nextJobID) ) {
assert(mtctx->jobs[wJobID].dstFlushed <= mtctx->jobs[wJobID].cSize);
while (mtctx->jobs[wJobID].dstFlushed == mtctx->jobs[wJobID].cSize) { /* nothing to flush */
if (mtctx->jobs[wJobID].consumed == mtctx->jobs[wJobID].srcSize) {
if (mtctx->jobs[wJobID].consumed == mtctx->jobs[wJobID].src.size) {
DEBUGLOG(5, "job %u is completely consumed (%u == %u) => don't wait for cond, there will be none",
mtctx->doneJobID, (U32)mtctx->jobs[wJobID].consumed, (U32)mtctx->jobs[wJobID].srcSize);
mtctx->doneJobID, (U32)mtctx->jobs[wJobID].consumed, (U32)mtctx->jobs[wJobID].src.size);
break;
}
DEBUGLOG(5, "waiting for something to flush from job %u (currently flushed: %u bytes)",
@@ -1174,7 +1201,7 @@ static size_t ZSTDMT_flushProduced(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, u
/* try to flush something */
{ size_t cSize = mtctx->jobs[wJobID].cSize; /* shared */
size_t const srcConsumed = mtctx->jobs[wJobID].consumed; /* shared */
size_t const srcSize = mtctx->jobs[wJobID].srcSize; /* read-only, could be done after mutex lock, but no-declaration-after-statement */
size_t const srcSize = mtctx->jobs[wJobID].src.size; /* read-only, could be done after mutex lock, but no-declaration-after-statement */
ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);
if (ZSTD_isError(cSize)) {
DEBUGLOG(5, "ZSTDMT_flushProduced: job %u : compression error detected : %s",
@@ -1211,7 +1238,6 @@ static size_t ZSTDMT_flushProduced(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, u
&& (mtctx->jobs[wJobID].dstFlushed == cSize) ) { /* output buffer fully flushed => free this job position */
DEBUGLOG(5, "Job %u completed (%u bytes), moving to next one",
mtctx->doneJobID, (U32)mtctx->jobs[wJobID].dstFlushed);
assert(mtctx->jobs[wJobID].srcBuff.start == NULL); /* srcBuff supposed already released */
ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[wJobID].dstBuff);
mtctx->jobs[wJobID].dstBuff = g_nullBuffer;
mtctx->jobs[wJobID].cSize = 0; /* ensure this job slot is considered "not started" in future check */
@@ -1232,6 +1258,109 @@ static size_t ZSTDMT_flushProduced(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, u
return 0; /* internal buffers fully flushed */
}
/**
* Returns the range of data used by the earliest job that is not yet complete.
* If the data of the first job is broken up into two segments, we cover both
* sections.
*/
static range_t ZSTDMT_getInputDataInUse(ZSTDMT_CCtx* mtctx)
{
unsigned const firstJobID = mtctx->doneJobID;
unsigned const lastJobID = mtctx->nextJobID;
unsigned jobID;
for (jobID = firstJobID; jobID < lastJobID; ++jobID) {
unsigned const wJobID = jobID & mtctx->jobIDMask;
size_t consumed;
ZSTD_PTHREAD_MUTEX_LOCK(&mtctx->jobs[wJobID].job_mutex);
consumed = mtctx->jobs[wJobID].consumed;
ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex);
if (consumed < mtctx->jobs[wJobID].src.size) {
range_t range = mtctx->jobs[wJobID].prefix;
if (range.size == 0) {
/* Empty prefix */
range = mtctx->jobs[wJobID].src;
}
/* Job source in multiple segments not supported yet */
assert(range.start <= mtctx->jobs[wJobID].src.start);
return range;
}
}
return kNullRange;
}
/**
* Returns non-zero iff buffer and range overlap.
*/
static int ZSTDMT_isOverlapped(buffer_t buffer, range_t range)
{
BYTE const* const bufferStart = (BYTE const*)buffer.start;
BYTE const* const bufferEnd = bufferStart + buffer.capacity;
BYTE const* const rangeStart = (BYTE const*)range.start;
BYTE const* const rangeEnd = rangeStart + range.size;
if (rangeStart == NULL || bufferStart == NULL)
return 0;
return bufferStart < rangeEnd && rangeStart < bufferEnd;
}
/**
* Attempts to set the inBuff to the next section to fill.
* If any part of the new section is still in use we give up.
* Returns non-zero if the buffer is filled.
*/
static int ZSTDMT_tryGetInputRange(ZSTDMT_CCtx* mtctx)
{
range_t const inUse = ZSTDMT_getInputDataInUse(mtctx);
size_t const spaceLeft = mtctx->roundBuff.capacity - mtctx->roundBuff.pos;
size_t const target = mtctx->targetSectionSize;
buffer_t buffer;
assert(mtctx->inBuff.buffer.start == NULL);
assert(mtctx->roundBuff.capacity >= target);
if (spaceLeft < target) {
/* ZSTD_invalidateRepCodes() doesn't work for extDict variants.
* Simply copy the prefix to the beginning in that case.
*/
BYTE* const start = (BYTE*)mtctx->roundBuff.buffer;
size_t const prefixSize = mtctx->inBuff.prefix.size;
buffer.start = start;
buffer.capacity = prefixSize;
if (ZSTDMT_isOverlapped(buffer, inUse)) {
DEBUGLOG(6, "Waiting for buffer...");
return 0;
}
memmove(start, mtctx->inBuff.prefix.start, prefixSize);
mtctx->inBuff.prefix.start = start;
mtctx->roundBuff.pos = prefixSize;
}
buffer.start = mtctx->roundBuff.buffer + mtctx->roundBuff.pos;
buffer.capacity = target;
if (ZSTDMT_isOverlapped(buffer, inUse)) {
DEBUGLOG(6, "Waiting for buffer...");
return 0;
}
assert(!ZSTDMT_isOverlapped(buffer, mtctx->inBuff.prefix));
DEBUGLOG(5, "Using prefix range [%zx, %zx)",
(size_t)mtctx->inBuff.prefix.start,
(size_t)mtctx->inBuff.prefix.start + mtctx->inBuff.prefix.size);
DEBUGLOG(5, "Using source range [%zx, %zx)",
(size_t)buffer.start,
(size_t)buffer.start + buffer.capacity);
mtctx->inBuff.buffer = buffer;
mtctx->inBuff.filled = 0;
assert(mtctx->roundBuff.pos + buffer.capacity <= mtctx->roundBuff.capacity);
return 1;
}
/** ZSTDMT_compressStream_generic() :
* internal use only - exposed to be invoked from zstd_compress.c
@@ -1242,7 +1371,6 @@ size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx,
ZSTD_inBuffer* input,
ZSTD_EndDirective endOp)
{
size_t const newJobThreshold = mtctx->inBuff.prefixSize + mtctx->targetSectionSize;
unsigned forwardInputProgress = 0;
DEBUGLOG(5, "ZSTDMT_compressStream_generic (endOp=%u, srcSize=%u)",
(U32)endOp, (U32)(input->size - input->pos));
@@ -1271,7 +1399,6 @@ size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx,
if (ZSTD_isError(cSize)) return cSize;
input->pos = input->size;
output->pos += cSize;
ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->inBuff.buffer); /* was allocated in initStream */
mtctx->allJobsCompleted = 1;
mtctx->frameEnded = 1;
return 0;
@@ -1281,18 +1408,19 @@ size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx,
if ( (!mtctx->jobReady)
&& (input->size > input->pos) ) { /* support NULL input */
if (mtctx->inBuff.buffer.start == NULL) {
mtctx->inBuff.buffer = ZSTDMT_getBuffer(mtctx->bufPool); /* note : allocation can fail, in which case, buffer.start==NULL */
mtctx->inBuff.filled = 0;
if ( (mtctx->inBuff.buffer.start == NULL) /* allocation failure */
&& (mtctx->doneJobID == mtctx->nextJobID) ) { /* and nothing to flush */
return ERROR(memory_allocation); /* no forward progress possible => output an error */
assert(mtctx->inBuff.filled == 0); /* Can't fill an empty buffer */
if (!ZSTDMT_tryGetInputRange(mtctx)) {
/* It is only possible for this operation to fail if there are
* still compression jobs ongoing.
*/
assert(mtctx->doneJobID != mtctx->nextJobID);
}
assert(mtctx->inBuff.buffer.capacity >= mtctx->inBuff.targetCapacity); /* pool must provide a buffer >= targetCapacity */
}
if (mtctx->inBuff.buffer.start != NULL) { /* no buffer for input, but it's possible to flush, and then reclaim the buffer */
size_t const toLoad = MIN(input->size - input->pos, mtctx->inBuff.targetCapacity - mtctx->inBuff.filled);
if (mtctx->inBuff.buffer.start != NULL) {
size_t const toLoad = MIN(input->size - input->pos, mtctx->targetSectionSize - mtctx->inBuff.filled);
assert(mtctx->inBuff.buffer.capacity >= mtctx->targetSectionSize);
DEBUGLOG(5, "ZSTDMT_compressStream_generic: adding %u bytes on top of %u to buffer of size %u",
(U32)toLoad, (U32)mtctx->inBuff.filled, (U32)mtctx->inBuff.targetCapacity);
(U32)toLoad, (U32)mtctx->inBuff.filled, (U32)mtctx->targetSectionSize);
memcpy((char*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled, (const char*)input->src + input->pos, toLoad);
input->pos += toLoad;
mtctx->inBuff.filled += toLoad;
@@ -1303,10 +1431,11 @@ size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx,
}
if ( (mtctx->jobReady)
|| (mtctx->inBuff.filled >= newJobThreshold) /* filled enough : let's compress */
|| (mtctx->inBuff.filled >= mtctx->targetSectionSize) /* filled enough : let's compress */
|| ((endOp != ZSTD_e_continue) && (mtctx->inBuff.filled > 0)) /* something to flush : let's go */
|| ((endOp == ZSTD_e_end) && (!mtctx->frameEnded)) ) { /* must finish the frame with a zero-size block */
size_t const jobSize = MIN(mtctx->inBuff.filled - mtctx->inBuff.prefixSize, mtctx->targetSectionSize);
size_t const jobSize = mtctx->inBuff.filled;
assert(mtctx->inBuff.filled <= mtctx->targetSectionSize);
CHECK_F( ZSTDMT_createCompressionJob(mtctx, jobSize, endOp) );
}
@@ -1323,13 +1452,13 @@ size_t ZSTDMT_compressStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_in
CHECK_F( ZSTDMT_compressStream_generic(mtctx, output, input, ZSTD_e_continue) );
/* recommended next input size : fill current input buffer */
return mtctx->inBuff.targetCapacity - mtctx->inBuff.filled; /* note : could be zero when input buffer is fully filled and no more availability to create new job */
return mtctx->targetSectionSize - mtctx->inBuff.filled; /* note : could be zero when input buffer is fully filled and no more availability to create new job */
}
static size_t ZSTDMT_flushStream_internal(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_EndDirective endFrame)
{
size_t const srcSize = mtctx->inBuff.filled - mtctx->inBuff.prefixSize;
size_t const srcSize = mtctx->inBuff.filled;
DEBUGLOG(5, "ZSTDMT_flushStream_internal");
if ( mtctx->jobReady /* one job ready for a worker to pick up */

tests/playTests.sh

@@ -213,8 +213,8 @@ roundTripTest -g512K
roundTripTest -g512K " --zstd=slen=3,tlen=48,strat=6"
roundTripTest -g512K " --zstd=strat=6,wlog=23,clog=23,hlog=22,slog=6"
roundTripTest -g512K " --zstd=windowLog=23,chainLog=23,hashLog=22,searchLog=6,searchLength=3,targetLength=48,strategy=6"
roundTripTest -g512K " --long --zstd=ldmHashLog=20,ldmSearchLength=64,ldmBucketSizeLog=1,ldmHashEveryLog=7"
roundTripTest -g512K " --long --zstd=ldmhlog=20,ldmslen=64,ldmblog=1,ldmhevery=7"
roundTripTest -g512K " --single-thread --long --zstd=ldmHashLog=20,ldmSearchLength=64,ldmBucketSizeLog=1,ldmHashEveryLog=7"
roundTripTest -g512K " --single-thread --long --zstd=ldmhlog=20,ldmslen=64,ldmblog=1,ldmhevery=7"
roundTripTest -g512K 19
@@ -630,12 +630,12 @@ roundTripTest -g516K 19 # btopt
fileRoundTripTest -g500K
$ECHO "\n===> zstd long distance matching round-trip tests "
roundTripTest -g0 "2 --long"
roundTripTest -g1000K "1 --long"
roundTripTest -g517K "6 --long"
roundTripTest -g516K "16 --long"
roundTripTest -g518K "19 --long"
fileRoundTripTest -g5M "3 --long"
roundTripTest -g0 "2 --single-thread --long"
roundTripTest -g1000K "1 --single-thread --long"
roundTripTest -g517K "6 --single-thread --long"
roundTripTest -g516K "16 --single-thread --long"
roundTripTest -g518K "19 --single-thread --long"
fileRoundTripTest -g5M "3 --single-thread --long"
roundTripTest -g96K "5 --single-thread"
@@ -648,7 +648,7 @@ then
fileRoundTripTest -g4M "19 -T2 -B1M"
$ECHO "\n===> zstdmt long distance matching round-trip tests "
roundTripTest -g8M "3 --long -T2"
roundTripTest -g8M "3 --long=24 -T2"
else
$ECHO "\n===> no multithreading, skipping zstdmt tests "
fi
@@ -699,13 +699,13 @@ rm tmp*
$ECHO "\n===> zstd long distance matching tests "
roundTripTest -g0 " --long"
roundTripTest -g9M "2 --long"
roundTripTest -g0 " --single-thread --long"
roundTripTest -g9M "2 --single-thread --long"
# Test parameter parsing
roundTripTest -g1M -P50 "1 --long=29" " --memory=512MB"
roundTripTest -g1M -P50 "1 --long=29 --zstd=wlog=28" " --memory=256MB"
roundTripTest -g1M -P50 "1 --long=29" " --long=28 --memory=512MB"
roundTripTest -g1M -P50 "1 --long=29" " --zstd=wlog=28 --memory=512MB"
roundTripTest -g1M -P50 "1 --single-thread --long=29" " --memory=512MB"
roundTripTest -g1M -P50 "1 --single-thread --long=29 --zstd=wlog=28" " --memory=256MB"
roundTripTest -g1M -P50 "1 --single-thread --long=29" " --long=28 --memory=512MB"
roundTripTest -g1M -P50 "1 --single-thread --long=29" " --zstd=wlog=28 --memory=512MB"
if [ "$1" != "--test-large-data" ]; then
@@ -746,13 +746,13 @@ fileRoundTripTest -g4193M -P99 1
$ECHO "\n===> zstd long, long distance matching round-trip tests "
roundTripTest -g270000000 "1 --long"
roundTripTest -g130000000 -P60 "5 --long"
roundTripTest -g35000000 -P70 "8 --long"
roundTripTest -g18000001 -P80 "18 --long"
roundTripTest -g270000000 "1 --single-thread --long"
roundTripTest -g130000000 -P60 "5 --single-thread --long"
roundTripTest -g35000000 -P70 "8 --single-thread --long"
roundTripTest -g18000001 -P80 "18 --single-thread --long"
# Test large window logs
roundTripTest -g700M -P50 "1 --long=29"
roundTripTest -g600M -P50 "1 --long --zstd=wlog=29,clog=28"
roundTripTest -g700M -P50 "1 --single-thread --long=29"
roundTripTest -g600M -P50 "1 --single-thread --long --zstd=wlog=29,clog=28"
if [ -n "$hasMT" ]