diff --git a/doc/zstd_manual.html b/doc/zstd_manual.html index 74856823..9b743c34 100644 --- a/doc/zstd_manual.html +++ b/doc/zstd_manual.html @@ -801,10 +801,12 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long /* These parameters are only useful if multi-threading is enabled (ZSTD_MULTITHREAD). * They return an error otherwise. */ ZSTD_p_nbWorkers=400, /* Select how many threads will be spawned to compress in parallel. - * Triggers asynchronous mode, even with nbWorkers = 1. - * Can only be set to a value >= 1 if ZSTD_MULTITHREAD is enabled. - * More threads improve speed, but also increase memory usage. - * Default value is `0`, aka "blocking mode" : no worker is spawned, compression is performed inside Caller's thread */ + * When nbWorkers >= 1, triggers asynchronous mode : + * ZSTD_compress_generic() consumes some input, flush some output if possible, and immediately gives back control to caller, + * while compression work is performed in parallel, within worker threads. + * (note : a strong exception to this rule is when first invocation sets ZSTD_e_end : it becomes a blocking call). + * More workers improve speed, but also increase memory usage. + * Default value is `0`, aka "single-threaded mode" : no worker is spawned, compression is performed inside Caller's thread, all invocations are blocking */ ZSTD_p_jobSize, /* Size of a compression job. This value is only enforced in streaming (non-blocking) mode. * Each compression job is completed in parallel, so indirectly controls the nb of active threads. * 0 means default, which is dynamically determined based on compression parameters. @@ -854,8 +856,10 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long
size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, unsigned value);
 

Set one compression parameter, selected by enum ZSTD_cParameter. + Setting a parameter is generally only possible during frame initialization (before starting compression), + except for a few parameters which can be updated during compression: compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy. Note : when `value` is an enum, cast it to unsigned for proper type checking. - @result : informational value (typically, the one being set, possibly corrected), + @result : informational value (typically, the value being set, possibly clamped), or an error code (which can be tested with ZSTD_isError()).


@@ -1025,9 +1029,10 @@ size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t
size_t ZSTD_CCtx_setParametersUsingCCtxParams(
         ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params);
 

Apply a set of ZSTD_CCtx_params to the compression context. - This must be done before the dictionary is loaded. - The pledgedSrcSize is treated as unknown. - Multithreading parameters are applied only if nbWorkers >= 1. + This can be done even after compression is started. + If nbWorkers==0, this will have no impact until a new compression is started. + If nbWorkers>=1, new parameters will be picked up at the next job, + with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated).


diff --git a/lib/common/fse_decompress.c b/lib/common/fse_decompress.c index 8e3f0035..4c66c3b7 100644 --- a/lib/common/fse_decompress.c +++ b/lib/common/fse_decompress.c @@ -139,8 +139,8 @@ size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned { U32 u; for (u=0; uappliedParams.cParams.strategy != ZSTD_fast) { U32 const chainSize = (U32)1 << zc->appliedParams.cParams.chainLog; if (zc->appliedParams.cParams.strategy == ZSTD_btlazy2) - ZSTD_preserveUnsortedMark(ms->chainTable, chainSize, reducerValue); - ZSTD_reduceTable(ms->chainTable, chainSize, reducerValue); + ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerValue); + else + ZSTD_reduceTable(ms->chainTable, chainSize, reducerValue); } if (ms->hashLog3) { diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index 22213c22..9da2cbfe 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -33,6 +33,12 @@ extern "C" { #define kSearchStrength 8 #define HASH_READ_SIZE 8 #define ZSTD_CLEVEL_CUSTOM 999 +#define ZSTD_DUBT_UNSORTED_MARK 1 /* For btlazy2 strategy, index 1 now means "unsorted". It could be confused for a real successor at index "1", if sorted as larger than its predecessor. It's not a big deal though : candidate will just be sorted again. Additionally, candidate position 1 will be lost. But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss. 
+ The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be misdhandled after table re-use with a different strategy */ /*-************************************* diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c index 7cb53b2e..0dab117a 100644 --- a/lib/compress/zstd_lazy.c +++ b/lib/compress/zstd_lazy.c @@ -15,35 +15,6 @@ /*-************************************* * Binary Tree search ***************************************/ -#define ZSTD_DUBT_UNSORTED_MARK 1 /* note : index 1 will now be confused with "unsorted" if sorted as larger than its predecessor. - It's not a big deal though : the candidate will just be considered unsorted, and be sorted again. - Additionnally, candidate position 1 will be lost. - But candidate 1 cannot hide a large tree of candidates, so it's a moderate loss. - The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be misdhandled by a table re-use using a different strategy */ - -/*! ZSTD_preserveUnsortedMark() : - * pre-emptively increase value of ZSTD_DUBT_UNSORTED_MARK before ZSTD_reduceTable() - * so that combined operation preserves its value. - * Without it, ZSTD_DUBT_UNSORTED_MARK==1 would be squashed to 0. - * As a consequence, the list of unsorted elements would stop on the first element, - * removing candidates, resulting in a negligible loss to compression ratio - * (since overflow protection with ZSTD_reduceTable() is relatively rare). - * Another potential risk is that a position will be promoted from *unsorted* - * to *sorted=>smaller:0*, meaning the next candidate will be considered smaller. - * This could be wrong, and result in data corruption. - * On second thought, this corruption might be impossible, - * because unsorted elements are always at the beginning of the list, - * and squashing to zero reduce the list to a single element, - * which needs to be sorted anyway. 
- * I haven't spent much thoughts into this possible scenario, - * and just felt it was safer to implement ZSTD_preserveUnsortedMark() */ -void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue) -{ - U32 u; - for (u=0; udtable; return 0; case set_repeat: if (!flagRepeatTable) return ERROR(corruption_detected); @@ -936,13 +940,14 @@ static seq_t ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e l ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); assert(ofBits <= MaxOff); - if (MEM_32bits() && longOffsets) { - U32 const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN_32-1); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { + U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); offset = OF_base[ofCode] + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); - if (MEM_32bits() || extraBits) BIT_reloadDStream(&seqState->DStream); + BIT_reloadDStream(&seqState->DStream); if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); + assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ } else { - offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); } } @@ -955,7 +960,7 @@ static seq_t ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e l if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; seqState->prevOffset[1] = seqState->prevOffset[0]; seqState->prevOffset[0] = offset = temp; - } else { + } else { /* offset == 0 */ offset = seqState->prevOffset[0]; } } else { @@ -967,16 +972,16 @@ static seq_t ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e l } seq.matchLength = 
ML_base[mlCode] - + ((mlCode>31) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <= 16 bits */ + + ((mlCode>31) ? BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/) : 0); /* <= 16 bits */ if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) BIT_reloadDStream(&seqState->DStream); if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) BIT_reloadDStream(&seqState->DStream); - /* Verify that there is enough bits to read the rest of the data in 64-bit mode. */ + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); seq.litLength = LL_base[llCode] - + ((llCode>15) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <= 16 bits */ + + ((llCode>15) ? BIT_readBitsFast(&seqState->DStream, llBits/*>0*/) : 0); /* <= 16 bits */ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); @@ -1364,13 +1369,13 @@ static size_t ZSTD_decompressSequencesLong( FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); /* prepare in advance */ - for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && seqNbfParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))); - /* windowSize could be any value at this point, since it is only validated - * in the streaming API. 
- */ DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); if (srcSize >= ZSTD_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); @@ -1429,7 +1430,9 @@ static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, ip += litCSize; srcSize -= litCSize; } - if (frame && dctx->fParams.windowSize > (1<<23)) + if ( frame /* windowSize exists */ + && (dctx->fParams.windowSize > (1<<24)) + && MEM_64bits() /* x86 benefits less from long mode than x64 */ ) return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, isLongOffset); return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, isLongOffset); }