Merge branch 'dev' into ahmed_file

2019-11-26 11:20:26 -08:00 · 2019-11-26 11:20:26 -08:00 · c71bd45a3b
commit c71bd45a3b
parent 5e657aca90 d6e0a44576
52 changed files with 1614 additions and 391 deletions
--- a/.gitignore
+++ b/.gitignore
@ -31,6 +31,7 @@ projects/
 bin/
 .buckd/
 buck-out/
+build-*

 # Other files
 .directory
@ -43,3 +44,5 @@ _zstdbench/
 googletest/
 *.d
 *.vscode
+compile_commands.json
+.clangd
--- a/.travis.yml
+++ b/.travis.yml
@ -32,9 +32,10 @@ matrix:
      script:
        - make check

-    - name: Trusty (Test All)
+    - name: make test (complete)
      script:
-        - make test
+        # DEVNULLRIGHTS : will request sudo rights to test permissions on /dev/null
+        - DEVNULLRIGHTS=test make test

    - name: gcc-6 + gcc-7 compilation
      script:
@ -80,43 +81,43 @@ matrix:
        - make clean
        - make -j check MOREFLAGS="-Werror -DZSTD_NO_INLINE -DZSTD_STRIP_ERROR_STRINGS"

-    - name: Trusty (CMake)
+    - name: cmake test
      script:
        - make cmakebuild

-    - name: Trusty (Static Analyze)
+    - name: static analyzer scanbuild
      script:
        - make staticAnalyze

-    - name: Trusty (gcc-8 + ASan + UBSan + Fuzz Test)
+    - name: gcc-8 + ASan + UBSan + Fuzz Test
      script:
        - make gcc8install
        - CC=gcc-8 make clean uasan-fuzztest

-    - name: Trusty (gcc-6 + ASan + UBSan + Fuzz Test 32bit)
+    - name: gcc-6 + ASan + UBSan + Fuzz Test 32bit
      script:
        - make gcc6install libc6install
        - CC=gcc-6 CFLAGS="-O2 -m32" make uasan-fuzztest   # can complain about pointer overflow

-    - name: Trusty (clang-3.8 + MSan + Fuzz Test)
+    - name: clang-3.8 + MSan + Fuzz Test
      script:
        - make clang38install
        - CC=clang-3.8 make clean msan-fuzztest

-    - name: Trusty (ASan + UBSan + MSan + Regression Test)
+    - name: ASan + UBSan + MSan + Regression Test
      script:
        - make -j uasanregressiontest
        - make clean
        - make -j msanregressiontest

-    - name: Trusty (Valgrind + Fuzz Test Stack Mode)
+    - name: Valgrind + Fuzz Test Stack Mode
      script:
        - make valgrindinstall
        - make -C tests clean valgrindTest
        - make clean
        - make -C tests test-fuzzer-stackmode

-    - name: Trusty (ARM + Fuzz Test)
+    - name: Qemu ARM emulation + Fuzz Test
      script:
        - make arminstall
        - make armfuzz
@ -127,12 +128,12 @@ matrix:
        - make arminstall
        - make aarch64fuzz

-    - name: Trusty (PPC + Fuzz Test)
+    - name: PPC + Fuzz Test
      script:
        - make ppcinstall
        - make ppcfuzz

-    - name: Trusty (Versions Compatibility Test)
+    - name: Versions Compatibility Test
      script:
        - make -C tests versionsTest

--- a/4
+++ b/4
@ -351,9 +351,9 @@ cmakebuild:
 	mkdir $(BUILDIR)/cmake/build
 	cd $(BUILDIR)/cmake/build ; cmake -DCMAKE_INSTALL_PREFIX:PATH=~/install_test_dir $(CMAKE_PARAMS) .. ; $(MAKE) install ; $(MAKE) uninstall

-c90build: clean
+c89build: clean
 	$(CC) -v
-	CFLAGS="-std=c90 -Werror" $(MAKE) allmost  # will fail, due to missing support for `long long`
+	CFLAGS="-std=c89 -Werror" $(MAKE) allmost  # will fail, due to missing support for `long long`

 gnu90build: clean
 	$(CC) -v
--- a/build/VS2008/fullbench/fullbench.vcproj
+++ b/build/VS2008/fullbench/fullbench.vcproj
@ -372,6 +372,10 @@
 				RelativePath="..\..\..\lib\compress\zstd_compress_sequences.c"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\lib\compress\zstd_compress_superblock.c"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\lib\compress\zstdmt_compress.c"
 				>
@ -514,6 +518,10 @@
 				RelativePath="..\..\..\lib\compress\zstd_cwksp.h"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\lib\compress\zstd_compress_superblock.h"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\lib\compress\zstd_fast.h"
 				>
--- a/build/VS2008/fuzzer/fuzzer.vcproj
+++ b/build/VS2008/fuzzer/fuzzer.vcproj
@ -420,6 +420,10 @@
 				RelativePath="..\..\..\lib\compress\zstd_compress_sequences.c"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\lib\compress\zstd_compress_superblock.c"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\lib\decompress\zstd_decompress.c"
 				>
@ -550,6 +554,10 @@
 				RelativePath="..\..\..\lib\compress\zstd_cwksp.h"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\lib\compress\zstd_compress_superblock.h"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\lib\compress\zstd_fast.h"
 				>
--- a/build/VS2008/zstd/zstd.vcproj
+++ b/build/VS2008/zstd/zstd.vcproj
@ -432,6 +432,10 @@
 				RelativePath="..\..\..\lib\compress\zstd_compress_sequences.c"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\lib\compress\zstd_compress_superblock.c"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\lib\decompress\zstd_decompress.c"
 				>
@ -630,6 +634,10 @@
 				RelativePath="..\..\..\lib\compress\zstd_cwksp.h"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\lib\compress\zstd_compress_superblock.h"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\lib\compress\zstd_fast.h"
 				>
--- a/build/VS2008/zstdlib/zstdlib.vcproj
+++ b/build/VS2008/zstdlib/zstdlib.vcproj
@ -404,6 +404,10 @@
 				RelativePath="..\..\..\lib\compress\zstd_compress_sequences.c"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\lib\compress\zstd_compress_superblock.c"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\lib\compress\zstd_fast.c"
 				>
@ -562,6 +566,10 @@
 				RelativePath="..\..\..\lib\compress\zstd_cwksp.h"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\lib\compress\zstd_compress_superblock.h"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\lib\compress\zstd_fast.h"
 				>
--- a/build/VS2010/fullbench/fullbench.vcxproj
+++ b/build/VS2010/fullbench/fullbench.vcxproj
@ -169,6 +169,7 @@
    <ClCompile Include="..\..\..\lib\compress\zstd_compress.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_compress_literals.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_compress_sequences.c" />
+    <ClCompile Include="..\..\..\lib\compress\zstd_compress_superblock.c" />
    <ClCompile Include="..\..\..\lib\compress\zstdmt_compress.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_fast.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_double_fast.c" />
@ -198,6 +199,7 @@
    <ClInclude Include="..\..\..\lib\compress\zstd_compress_literals.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_compress_sequences.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_cwksp.h" />
+    <ClInclude Include="..\..\..\lib\compress\zstd_compress_superblock.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_fast.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_double_fast.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_lazy.h" />
--- a/build/VS2010/fuzzer/fuzzer.vcxproj
+++ b/build/VS2010/fuzzer/fuzzer.vcxproj
@ -169,6 +169,7 @@
    <ClCompile Include="..\..\..\lib\compress\zstd_compress.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_compress_literals.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_compress_sequences.c" />
+    <ClCompile Include="..\..\..\lib\compress\zstd_compress_superblock.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_fast.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_double_fast.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_lazy.c" />
@ -201,6 +202,7 @@
    <ClInclude Include="..\..\..\lib\compress\zstd_compress_literals.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_compress_sequences.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_cwksp.h" />
+    <ClInclude Include="..\..\..\lib\compress\zstd_compress_superblock.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_fast.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_double_fast.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_lazy.h" />
--- a/build/VS2010/libzstd-dll/libzstd-dll.vcxproj
+++ b/build/VS2010/libzstd-dll/libzstd-dll.vcxproj
@ -33,6 +33,7 @@
    <ClCompile Include="..\..\..\lib\compress\zstd_compress.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_compress_literals.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_compress_sequences.c" />
+    <ClCompile Include="..\..\..\lib\compress\zstd_compress_superblock.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_fast.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_double_fast.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_lazy.c" />
@ -83,6 +84,7 @@
    <ClInclude Include="..\..\..\lib\compress\zstd_compress_literals.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_compress_sequences.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_cwksp.h" />
+    <ClInclude Include="..\..\..\lib\compress\zstd_compress_superblock.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_fast.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_double_fast.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_lazy.h" />
--- a/build/VS2010/libzstd/libzstd.vcxproj
+++ b/build/VS2010/libzstd/libzstd.vcxproj
@ -33,6 +33,7 @@
    <ClCompile Include="..\..\..\lib\compress\zstd_compress.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_compress_literals.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_compress_sequences.c" />
+    <ClCompile Include="..\..\..\lib\compress\zstd_compress_superblock.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_fast.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_double_fast.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_lazy.c" />
@ -83,6 +84,7 @@
    <ClInclude Include="..\..\..\lib\compress\zstd_compress_literals.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_compress_sequences.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_cwksp.h" />
+    <ClInclude Include="..\..\..\lib\compress\zstd_compress_superblock.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_fast.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_double_fast.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_lazy.h" />
--- a/build/VS2010/zstd/zstd.vcxproj
+++ b/build/VS2010/zstd/zstd.vcxproj
@ -34,6 +34,7 @@
    <ClCompile Include="..\..\..\lib\compress\zstd_compress.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_compress_literals.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_compress_sequences.c" />
+    <ClCompile Include="..\..\..\lib\compress\zstd_compress_superblock.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_fast.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_double_fast.c" />
    <ClCompile Include="..\..\..\lib\compress\zstd_lazy.c" />
@ -80,6 +81,7 @@
    <ClInclude Include="..\..\..\lib\compress\zstd_compress_literals.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_compress_sequences.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_cwksp.h" />
+    <ClInclude Include="..\..\..\lib\compress\zstd_compress_superblock.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_fast.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_double_fast.h" />
    <ClInclude Include="..\..\..\lib\compress\zstd_lazy.h" />
--- a/build/cmake/tests/CMakeLists.txt
+++ b/build/cmake/tests/CMakeLists.txt
@ -49,6 +49,9 @@ target_link_libraries(fullbench libzstd_static)
 add_executable(fuzzer ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${TESTS_DIR}/fuzzer.c)
 target_link_libraries(fuzzer libzstd_static)

+add_executable(zstreamtest ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${TESTS_DIR}/seqgen.c ${TESTS_DIR}/zstreamtest.c)
+target_link_libraries(zstreamtest libzstd_static)
+
 if (UNIX)
    add_executable(paramgrill ${PROGRAMS_DIR}/benchfn.c ${PROGRAMS_DIR}/benchzstd.c ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${TESTS_DIR}/paramgrill.c)
    target_link_libraries(paramgrill libzstd_static m) #m is math library
--- a/build/meson/lib/meson.build
+++ b/build/meson/lib/meson.build
@ -30,6 +30,7 @@ libzstd_sources = [join_paths(zstd_rootdir, 'lib/common/entropy_common.c'),
  join_paths(zstd_rootdir, 'lib/compress/zstd_compress.c'),
  join_paths(zstd_rootdir, 'lib/compress/zstd_compress_literals.c'),
  join_paths(zstd_rootdir, 'lib/compress/zstd_compress_sequences.c'),
+  join_paths(zstd_rootdir, 'lib/compress/zstd_compress_superblock.c'),
  join_paths(zstd_rootdir, 'lib/compress/zstdmt_compress.c'),
  join_paths(zstd_rootdir, 'lib/compress/zstd_fast.c'),
  join_paths(zstd_rootdir, 'lib/compress/zstd_double_fast.c'),
--- a/doc/zstd_compression_format.md
+++ b/doc/zstd_compression_format.md
@ -16,7 +16,7 @@ Distribution of this document is unlimited.

 ### Version

-0.3.4 (16/08/19)
+0.3.5 (13/11/19)


 Introduction
@ -341,6 +341,8 @@ The structure of a block is as follows:
 |:--------------:|:---------------:|
 |    3 bytes     |     n bytes     |

+__`Block_Header`__
+
 `Block_Header` uses 3 bytes, written using __little-endian__ convention.
 It contains 3 fields :

@ -385,17 +387,30 @@ There are 4 block types :
 __`Block_Size`__

 The upper 21 bits of `Block_Header` represent the `Block_Size`.
+
 When `Block_Type` is `Compressed_Block` or `Raw_Block`,
-`Block_Size` is the size of `Block_Content`, hence excluding `Block_Header`.  
-When `Block_Type` is `RLE_Block`, `Block_Content`’s size is always 1,
-and `Block_Size` represents the number of times this byte must be repeated.
-A block can contain and decompress into any number of bytes (even zero),
-up to `Block_Maximum_Decompressed_Size`, which is the smallest of:
-  Window_Size
+`Block_Size` is the size of `Block_Content` (hence excluding `Block_Header`).  
+
+When `Block_Type` is `RLE_Block`, since `Block_Content`’s size is always 1,
+`Block_Size` represents the number of times this byte must be repeated.
+
+`Block_Size` is limited by `Block_Maximum_Size` (see below).
+
+__`Block_Content`__ and __`Block_Maximum_Size`__
+
+The size of `Block_Content` is limited by `Block_Maximum_Size`,
+which is the smallest of:
+-  `Window_Size`
 -  128 KB

-If this condition cannot be respected when generating a `Compressed_Block`,
-the block must be sent uncompressed instead (`Raw_Block`).
+`Block_Maximum_Size` is constant for a given frame.
+This maximum is applicable to both the decompressed size
+and the compressed size of any block in the frame.
+
+The reasoning for this limit is that a decoder can read this information
+at the beginning of a frame and use it to allocate buffers.
+The guarantees on the size of blocks ensure that
+the buffers will be large enough for any following block of the valid frame.


 Compressed Blocks
@ -1658,6 +1673,7 @@ or at least provide a meaningful error code explaining for which reason it canno

 Version changes
 ---------------
+- 0.3.5 : clarifications for Block_Maximum_Size
 - 0.3.4 : clarifications for FSE decoding table
 - 0.3.3 : clarifications for field Block_Size
 - 0.3.2 : remove additional block size restriction on compressed blocks
--- a/lib/Makefile
+++ b/lib/Makefile
@ -161,7 +161,7 @@ ifneq (,$(filter Windows%,$(OS)))
 LIBZSTD = dll\libzstd.dll
 $(LIBZSTD): $(ZSTD_FILES)
 	@echo compiling dynamic library $(LIBVER)
-	$(CC) $(FLAGS) -DZSTD_DLL_EXPORT=1 -Wl,--out-implib,dll\libzstd.lib -shared $^ -o $@
+	$(CC) $(FLAGS) -DZSTD_DLL_EXPORT=1 -Wl,--out-implib,dll\libzstd.dll.a -shared $^ -o $@

 else

--- a/lib/common/bitstream.h
+++ b/lib/common/bitstream.h
@ -48,6 +48,7 @@ extern "C" {
 *  Dependencies
 ******************************************/
 #include "mem.h"            /* unaligned access routines */
+#include "compiler.h"       /* UNLIKELY() */
 #include "debug.h"          /* assert(), DEBUGLOG(), RAWLOG() */
 #include "error_private.h"  /* error codes and messages */

@ -411,6 +412,23 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits)
    return value;
 }

+/*! BIT_reloadDStreamFast() :
+ *  Similar to BIT_reloadDStream(), but with two differences:
+ *  1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold!
+ *  2. Returns BIT_DStream_overflow when bitD->ptr < bitD->limitPtr, at this
+ *     point you must use BIT_reloadDStream() to reload.
+ */
+MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD)
+{
+    if (UNLIKELY(bitD->ptr < bitD->limitPtr))
+        return BIT_DStream_overflow;
+    assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8);
+    bitD->ptr -= bitD->bitsConsumed >> 3;
+    bitD->bitsConsumed &= 7;
+    bitD->bitContainer = MEM_readLEST(bitD->ptr);
+    return BIT_DStream_unfinished;
+}
+
 /*! BIT_reloadDStream() :
 *  Refill `bitD` from buffer previously set in BIT_initDStream() .
 *  This function is safe, it guarantees it will not read beyond src buffer.
@ -422,10 +440,7 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
        return BIT_DStream_overflow;

    if (bitD->ptr >= bitD->limitPtr) {
-        bitD->ptr -= bitD->bitsConsumed >> 3;
-        bitD->bitsConsumed &= 7;
-        bitD->bitContainer = MEM_readLEST(bitD->ptr);
-        return BIT_DStream_unfinished;
+        return BIT_reloadDStreamFast(bitD);
    }
    if (bitD->ptr == bitD->start) {
        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
--- a/lib/common/compiler.h
+++ b/lib/common/compiler.h
@ -146,6 +146,19 @@
 #  define DONT_VECTORIZE
 #endif

+/* Tell the compiler that a branch is likely or unlikely.
+ * Only use these macros if it causes the compiler to generate better code.
+ * If you can remove a LIKELY/UNLIKELY annotation without speed changes in gcc
+ * and clang, please do.
+ */
+#if defined(__GNUC__)
+#define LIKELY(x) (__builtin_expect((x), 1))
+#define UNLIKELY(x) (__builtin_expect((x), 0))
+#else
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
+#endif
+
 /* disable warnings */
 #ifdef _MSC_VER    /* Visual Studio */
 #  include <intrin.h>                    /* For Visual 2005 */
--- a/lib/common/huf.h
+++ b/lib/common/huf.h
@ -208,6 +208,7 @@ typedef struct HUF_CElt_s HUF_CElt;   /* incomplete type */
 size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits);   /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */
 size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
 size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
+size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);

 typedef enum {
   HUF_repeat_none,  /**< Cannot use the previous table */
--- a/lib/compress/huf_compress.c
+++ b/lib/compress/huf_compress.c
@ -427,7 +427,7 @@ size_t HUF_buildCTable (HUF_CElt* tree, const unsigned* count, unsigned maxSymbo
    return HUF_buildCTable_wksp(tree, count, maxSymbolValue, maxNbBits, nodeTable, sizeof(nodeTable));
 }

-static size_t HUF_estimateCompressedSize(HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue)
+size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue)
 {
    size_t nbBits = 0;
    int s;
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@ -28,11 +28,19 @@
 #include "zstd_lazy.h"
 #include "zstd_opt.h"
 #include "zstd_ldm.h"
+#include "zstd_compress_superblock.h"


 /*-*************************************
 *  Helper functions
 ***************************************/
+/* ZSTD_compressBound()
+ * Note that the result from this function is only compatible with the "normal"
+ * full-block strategy.
+ * When there are a lot of small blocks due to frequent flush in streaming mode
+ * or targetCBlockSize, the overhead of headers can make the compressed data to
+ * be larger than the return value of ZSTD_compressBound().
+ */
 size_t ZSTD_compressBound(size_t srcSize) {
    return ZSTD_COMPRESSBOUND(srcSize);
 }
@ -249,12 +257,12 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete
 /* ZSTD_assignParamsToCCtxParams() :
 * params is presumed valid at this stage */
 static ZSTD_CCtx_params ZSTD_assignParamsToCCtxParams(
-        const ZSTD_CCtx_params* cctxParams, ZSTD_parameters params)
+        const ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params)
 {
    ZSTD_CCtx_params ret = *cctxParams;
-    assert(!ZSTD_checkCParams(params.cParams));
-    ret.cParams = params.cParams;
-    ret.fParams = params.fParams;
+    assert(!ZSTD_checkCParams(params->cParams));
+    ret.cParams = params->cParams;
+    ret.fParams = params->fParams;
    ret.compressionLevel = ZSTD_CLEVEL_DEFAULT;   /* should not matter, as all cParams are presumed properly defined */
    return ret;
 }
@ -339,8 +347,13 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
        return bounds;

    case ZSTD_c_overlapLog:
+#ifdef ZSTD_MULTITHREAD
        bounds.lowerBound = ZSTD_OVERLAPLOG_MIN;
        bounds.upperBound = ZSTD_OVERLAPLOG_MAX;
+#else
+        bounds.lowerBound = 0;
+        bounds.upperBound = 0;
+#endif
        return bounds;

    case ZSTD_c_enableLongDistanceMatching:
@ -845,7 +858,7 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
 {
    ZSTD_localDict* const dl = &cctx->localDict;
    ZSTD_compressionParameters const cParams = ZSTD_getCParamsFromCCtxParams(
-            &cctx->requestedParams, 0, dl->dictSize);
+            &cctx->requestedParams, ZSTD_CONTENTSIZE_UNKNOWN, dl->dictSize);
    if (dl->dict == NULL) {
        /* No local dictionary. */
        assert(dl->dictBuffer == NULL);
@ -1006,7 +1019,7 @@ static U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat)
 *  optimize `cPar` for a specified input (`srcSize` and `dictSize`).
 *  mostly downsize to reduce memory consumption and initialization latency.
 * `srcSize` can be ZSTD_CONTENTSIZE_UNKNOWN when not known.
- *  note : for the time being, `srcSize==0` means "unknown" too, for compatibility with older convention.
+ *  note : `srcSize==0` means 0!
 *  condition : cPar is presumed validated (can be checked using ZSTD_checkCParams()). */
 static ZSTD_compressionParameters
 ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
@ -1017,10 +1030,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
    static const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1);
    assert(ZSTD_checkCParams(cPar)==0);

-    if (dictSize && (srcSize+1<2) /* ZSTD_CONTENTSIZE_UNKNOWN and 0 mean "unknown" */ )
-        srcSize = minSrcSize;  /* presumed small when there is a dictionary */
-    else if (srcSize == 0)
-        srcSize = ZSTD_CONTENTSIZE_UNKNOWN;  /* 0 == unknown : presumed large */
+    if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN)
+        srcSize = minSrcSize;

    /* resize windowLog if input is small enough, to use less memory */
    if ( (srcSize < maxWindowResize)
@ -1049,9 +1060,13 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar,
                   size_t dictSize)
 {
    cPar = ZSTD_clampCParams(cPar);   /* resulting cPar is necessarily valid (all parameters within range) */
+    if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN;
    return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize);
 }

+static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize);
+static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize);
+
 ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
        const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize)
 {
@ -1059,7 +1074,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
    if (srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) {
      srcSizeHint = CCtxParams->srcSizeHint;
    }
-    cParams = ZSTD_getCParams(CCtxParams->compressionLevel, srcSizeHint, dictSize);
+    cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize);
    if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG;
    if (CCtxParams->cParams.windowLog) cParams.windowLog = CCtxParams->cParams.windowLog;
    if (CCtxParams->cParams.hashLog) cParams.hashLog = CCtxParams->cParams.hashLog;
@ -1069,6 +1084,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
    if (CCtxParams->cParams.targetLength) cParams.targetLength = CCtxParams->cParams.targetLength;
    if (CCtxParams->cParams.strategy) cParams.strategy = CCtxParams->cParams.strategy;
    assert(!ZSTD_checkCParams(cParams));
+    /* srcSizeHint == 0 means 0 */
    return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize);
 }

@ -1104,7 +1120,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
 {
    RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
    {   ZSTD_compressionParameters const cParams =
-                ZSTD_getCParamsFromCCtxParams(params, 0, 0);
+                ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0);
        size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
        U32    const divider = (cParams.minMatch==3) ? 3 : 4;
        size_t const maxNbSeq = blockSize / divider;
@ -1136,7 +1152,7 @@ size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams)

 static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel)
 {
-    ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, 0, 0);
+    ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0);
    return ZSTD_estimateCCtxSize_usingCParams(cParams);
 }

@ -1155,7 +1171,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
 {
    RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
    {   ZSTD_compressionParameters const cParams =
-                ZSTD_getCParamsFromCCtxParams(params, 0, 0);
+                ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0);
        size_t const CCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(params);
        size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
        size_t const inBuffSize = ((size_t)1 << cParams.windowLog) + blockSize;
@ -1175,7 +1191,7 @@ size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams)

 static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel)
 {
-    ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, 0, 0);
+    ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0);
    return ZSTD_estimateCStreamSize_usingCParams(cParams);
 }

@ -1243,7 +1259,7 @@ static void ZSTD_assertEqualCParams(ZSTD_compressionParameters cParams1,
    assert(cParams1.strategy     == cParams2.strategy);
 }

-static void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs)
+void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs)
 {
    int i;
    for (i = 0; i < ZSTD_REP_NUM; ++i)
@ -1320,10 +1336,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,

    DEBUGLOG(4, "reset indices : %u", forceResetIndex == ZSTDirp_reset);
    if (forceResetIndex == ZSTDirp_reset) {
-        memset(&ms->window, 0, sizeof(ms->window));
-        ms->window.dictLimit = 1;    /* start from 1, so that 1st position is valid */
-        ms->window.lowLimit = 1;     /* it ensures first and later CCtx usages compress the same */
-        ms->window.nextSrc = ms->window.base + 1;   /* see issue #1241 */
+        ZSTD_window_init(&ms->window);
        ZSTD_cwksp_mark_tables_dirty(ws);
    }

@ -1416,7 +1429,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
        size_t const matchStateSize = ZSTD_sizeof_matchState(&params.cParams, /* forCCtx */ 1);
        size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params.ldmParams, blockSize);

-        ZSTD_indexResetPolicy_e needsIndexReset = ZSTDirp_continue;
+        ZSTD_indexResetPolicy_e needsIndexReset = zc->initialized ? ZSTDirp_continue : ZSTDirp_reset;

        if (ZSTD_indexTooCloseToMax(zc->blockState.matchState.window)) {
            needsIndexReset = ZSTDirp_reset;
@ -1541,11 +1554,12 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
            zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq));
            zc->maxNbLdmSequences = maxNbLdmSeq;

-            memset(&zc->ldmState.window, 0, sizeof(zc->ldmState.window));
+            ZSTD_window_init(&zc->ldmState.window);
            ZSTD_window_clear(&zc->ldmState.window);
        }

        DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws));
+        zc->initialized = 1;

        return 0;
    }
@ -1603,6 +1617,7 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
        assert(windowLog != 0);
        /* Resize working context table params for input only, since the dict
         * has its own tables. */
+        /* pledgeSrcSize == 0 means 0! */
        params.cParams = ZSTD_adjustCParams_internal(*cdict_cParams, pledgedSrcSize, 0);
        params.cParams.windowLog = windowLog;
        FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize,
@ -1889,16 +1904,6 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par

 /* See doc/zstd_compression_format.md for detailed format description */

-static size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock)
-{
-    U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3);
-    RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity,
-                    dstSize_tooSmall);
-    MEM_writeLE24(dst, cBlockHeader24);
-    memcpy((BYTE*)dst + ZSTD_blockHeaderSize, src, srcSize);
-    return ZSTD_blockHeaderSize + srcSize;
-}
-
 void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
 {
    const seqDef* const sequences = seqStorePtr->sequencesStart;
@ -1936,6 +1941,16 @@ static int ZSTD_disableLiteralsCompression(const ZSTD_CCtx_params* cctxParams)
    }
 }

+/* ZSTD_useTargetCBlockSize():
+ * Returns if target compressed block size param is being used.
+ * If used, compression will do best effort to make a compressed block size to be around targetCBlockSize.
+ * Returns 1 if true, 0 otherwise. */
+static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams)
+{
+    DEBUGLOG(5, "ZSTD_useTargetCBlockSize (targetCBlockSize=%zu)", cctxParams->targetCBlockSize);
+    return (cctxParams->targetCBlockSize != 0);
+}
+
 /* ZSTD_compressSequences_internal():
 * actually compresses both literals and sequences */
 MEM_STATIC size_t
@ -2435,6 +2450,70 @@ out:
    return cSize;
 }

+static size_t ZSTD_compressBlock_targetCBlockSize(ZSTD_CCtx* zc,
+                               void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize,
+                               U32 lastBlock) {
+    size_t cSize = 0;
+    DEBUGLOG(5, "ZSTD_compressBlock_targetCBlockSize (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u, srcSize=%zu)",
+                (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, (unsigned)zc->blockState.matchState.nextToUpdate, srcSize);
+
+    {   const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize);
+        FORWARD_IF_ERROR(bss);
+        if (bss == ZSTDbss_compress) {
+            cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, lastBlock);
+    }   }
+
+    /* Superblock compression may fail, in which case
+     * encode using ZSTD_noCompressSuperBlock writing sub blocks
+     * in uncompressed mode.
+     */
+    if (cSize == 0) {
+        cSize = ZSTD_noCompressSuperBlock(dst, dstCapacity, src, srcSize, zc->appliedParams.targetCBlockSize, lastBlock);
+        /* In compression, there is an assumption that a compressed block is always
+         * within the size of ZSTD_compressBound(). However, SuperBlock compression
+         * can exceed the limit due to overhead of headers from SubBlocks.
+         * This breaks in streaming mode where output buffer in compress context is
+         * allocated ZSTD_compressBound() amount of memory, which may not be big
+         * enough for SuperBlock compression.
+         * In such case, fall back to normal compression. This is possible because
+         * targetCBlockSize is best effort not a guarantee. */
+        if (cSize != ERROR(dstSize_tooSmall)) return cSize;
+        else {
+            BYTE* const ostart = (BYTE*)dst;
+            /* If ZSTD_noCompressSuperBlock fails with dstSize_tooSmall,
+             * compress normally.
+             */
+            cSize = ZSTD_compressSequences(&zc->seqStore,
+                    &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy,
+                    &zc->appliedParams,
+                    ostart+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize,
+                    srcSize,
+                    zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
+                    zc->bmi2);
+            if (!ZSTD_isError(cSize) && cSize != 0) {
+                U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
+                MEM_writeLE24(ostart, cBlockHeader24);
+                cSize += ZSTD_blockHeaderSize;
+            }
+        }
+    }
+
+    if (!ZSTD_isError(cSize) && cSize != 0) {
+        /* confirm repcodes and entropy tables when emitting a compressed block */
+        ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock;
+        zc->blockState.prevCBlock = zc->blockState.nextCBlock;
+        zc->blockState.nextCBlock = tmp;
+    }
+    /* We check that dictionaries have offset codes available for the first
+     * block. After the first block, the offcode table might not have large
+     * enough codes to represent the offsets in the data.
+     */
+    if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
+        zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
+
+    return cSize;
+}

 static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms,
                                         ZSTD_cwksp* ws,
@ -2500,20 +2579,29 @@ static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx,
        /* Ensure hash/chain table insertion resumes no sooner than lowlimit */
        if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit;

-        {   size_t cSize = ZSTD_compressBlock_internal(cctx,
+        {   size_t cSize;
+            int useTargetCBlockSize = ZSTD_useTargetCBlockSize(&cctx->appliedParams);
+            if (useTargetCBlockSize) {
+                cSize = ZSTD_compressBlock_targetCBlockSize(cctx, op, dstCapacity, ip, blockSize, lastBlock);
+                FORWARD_IF_ERROR(cSize);
+            } else {
+                cSize = ZSTD_compressBlock_internal(cctx,
                                        op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize,
                                        ip, blockSize, 1 /* frame */);
                FORWARD_IF_ERROR(cSize);
+
                if (cSize == 0) {  /* block is not compressible */
                    cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
                    FORWARD_IF_ERROR(cSize);
                } else {
-                const U32 cBlockHeader = cSize == 1 ?
+                    U32 const cBlockHeader = cSize == 1 ?
                        lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) :
                        lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
                    MEM_writeLE24(op, cBlockHeader);
                    cSize += ZSTD_blockHeaderSize;
                }
+            }
+

            ip += blockSize;
            assert(remaining >= blockSize);
@ -2763,37 +2851,13 @@ static size_t ZSTD_checkDictNCount(short* normalizedCounter, unsigned dictMaxSym
    return 0;
 }

-
-/* Dictionary format :
- * See :
- * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
- */
-/*! ZSTD_loadZstdDictionary() :
- * @return : dictID, or an error code
- *  assumptions : magic number supposed already checked
- *                dictSize supposed >= 8
- */
-static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
-                                      ZSTD_matchState_t* ms,
-                                      ZSTD_cwksp* ws,
-                                      ZSTD_CCtx_params const* params,
-                                      const void* dict, size_t dictSize,
-                                      ZSTD_dictTableLoadMethod_e dtlm,
-                                      void* workspace)
+size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
+                         short* offcodeNCount, unsigned* offcodeMaxValue,
+                         const void* const dict, size_t dictSize) 
 {
-    const BYTE* dictPtr = (const BYTE*)dict;
+    const BYTE* dictPtr = (const BYTE*)dict;    /* skip magic num and dict ID */
    const BYTE* const dictEnd = dictPtr + dictSize;
-    short offcodeNCount[MaxOff+1];
-    unsigned offcodeMaxValue = MaxOff;
-    size_t dictID;
-
-    ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
-    assert(dictSize >= 8);
-    assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY);
-
-    dictPtr += 4;   /* skip magic number */
-    dictID = params->fParams.noDictIDFlag ? 0 :  MEM_readLE32(dictPtr);
-    dictPtr += 4;
+    dictPtr += 8;

    {   unsigned maxSymbolValue = 255;
        size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.huf.CTable, &maxSymbolValue, dictPtr, dictEnd-dictPtr);
@ -2803,7 +2867,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
    }

    {   unsigned offcodeLog;
-        size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
+        size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
        RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted);
        RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted);
        /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
@ -2852,6 +2916,42 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
    bs->rep[2] = MEM_readLE32(dictPtr+8);
    dictPtr += 12;

+    return dictPtr - (const BYTE*)dict;
+}
+
+/* Dictionary format :
+ * See :
+ * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
+ */
+/*! ZSTD_loadZstdDictionary() :
+ * @return : dictID, or an error code
+ *  assumptions : magic number supposed already checked
+ *                dictSize supposed >= 8
+ */
+static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
+                                      ZSTD_matchState_t* ms,
+                                      ZSTD_cwksp* ws,
+                                      ZSTD_CCtx_params const* params,
+                                      const void* dict, size_t dictSize,
+                                      ZSTD_dictTableLoadMethod_e dtlm,
+                                      void* workspace)
+{
+    const BYTE* dictPtr = (const BYTE*)dict;
+    const BYTE* const dictEnd = dictPtr + dictSize;
+    short offcodeNCount[MaxOff+1];
+    unsigned offcodeMaxValue = MaxOff;
+    size_t dictID;
+    size_t eSize;
+
+    ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
+    assert(dictSize >= 8);
+    assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY);
+
+    dictID = params->fParams.noDictIDFlag ? 0 :  MEM_readLE32(dictPtr + 4 /* skip magic number */ );
+    eSize = ZSTD_loadCEntropy(bs, workspace, offcodeNCount, &offcodeMaxValue, dict, dictSize);
+    FORWARD_IF_ERROR(eSize);
+    dictPtr += eSize;
+
    {   size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
        U32 offcodeMax = MaxOff;
        if (dictContentSize <= ((U32)-1) - 128 KB) {
@ -2986,7 +3086,7 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx,
                                   ZSTD_parameters params, unsigned long long pledgedSrcSize)
 {
    ZSTD_CCtx_params const cctxParams =
-            ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, params);
+            ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, &params);
    return ZSTD_compressBegin_advanced_internal(cctx,
                                            dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast,
                                            NULL /*cdict*/,
@ -2995,9 +3095,9 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx,

 size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
 {
-    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize);
+    ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize);
    ZSTD_CCtx_params const cctxParams =
-            ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, params);
+            ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, &params);
    DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize);
    return ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL,
                                       &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered);
@ -3081,7 +3181,7 @@ static size_t ZSTD_compress_internal (ZSTD_CCtx* cctx,
                                      void* dst, size_t dstCapacity,
                                const void* src, size_t srcSize,
                                const void* dict,size_t dictSize,
-                                      ZSTD_parameters params)
+                                const ZSTD_parameters* params)
 {
    ZSTD_CCtx_params const cctxParams =
            ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, params);
@ -3105,7 +3205,7 @@ size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx,
                                  dst, dstCapacity,
                                  src, srcSize,
                                  dict, dictSize,
-                                  params);
+                                  &params);
 }

 /* Internal */
@ -3129,8 +3229,8 @@ size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx,
                         const void* dict, size_t dictSize,
                               int compressionLevel)
 {
-    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, srcSize + (!srcSize), dict ? dictSize : 0);
-    ZSTD_CCtx_params cctxParams = ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, params);
+    ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, srcSize, dict ? dictSize : 0);
+    ZSTD_CCtx_params cctxParams = ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, &params);
    assert(params.fParams.contentSizeFlag == 1);
    return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctxParams);
 }
@ -3176,7 +3276,7 @@ size_t ZSTD_estimateCDictSize_advanced(

 size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel)
 {
-    ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, 0, dictSize);
+    ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize);
    return ZSTD_estimateCDictSize_advanced(dictSize, cParams, ZSTD_dlm_byCopy);
 }

@ -3287,7 +3387,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize,

 ZSTD_CDict* ZSTD_createCDict(const void* dict, size_t dictSize, int compressionLevel)
 {
-    ZSTD_compressionParameters cParams = ZSTD_getCParams(compressionLevel, 0, dictSize);
+    ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize);
    ZSTD_CDict* cdict = ZSTD_createCDict_advanced(dict, dictSize,
                                                  ZSTD_dlm_byCopy, ZSTD_dct_auto,
                                                  cParams, ZSTD_defaultCMem);
@ -3298,7 +3398,7 @@ ZSTD_CDict* ZSTD_createCDict(const void* dict, size_t dictSize, int compressionL

 ZSTD_CDict* ZSTD_createCDict_byReference(const void* dict, size_t dictSize, int compressionLevel)
 {
-    ZSTD_compressionParameters cParams = ZSTD_getCParams(compressionLevel, 0, dictSize);
+    ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize);
    return ZSTD_createCDict_advanced(dict, dictSize,
                                     ZSTD_dlm_byRef, ZSTD_dct_auto,
                                     cParams, ZSTD_defaultCMem);
@ -3590,7 +3690,7 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
    FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) );
    FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) );
    FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) );
-    zcs->requestedParams = ZSTD_assignParamsToCCtxParams(&zcs->requestedParams, params);
+    zcs->requestedParams = ZSTD_assignParamsToCCtxParams(&zcs->requestedParams, &params);
    FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) );
    return 0;
 }
@ -3655,11 +3755,11 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                                          ZSTD_EndDirective const flushMode)
 {
    const char* const istart = (const char*)input->src;
-    const char* const iend = istart + input->size;
-    const char* ip = istart + input->pos;
+    const char* const iend = input->size != 0 ? istart + input->size : istart;
+    const char* ip = input->pos != 0 ? istart + input->pos : istart;
    char* const ostart = (char*)output->dst;
-    char* const oend = ostart + output->size;
-    char* op = ostart + output->pos;
+    char* const oend = output->size != 0 ? ostart + output->size : ostart;
+    char* op = output->pos != 0 ? ostart + output->pos : ostart;
    U32 someMoreWork = 1;

    /* check expectations */
@ -3698,6 +3798,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                                        zcs->inBuff + zcs->inBuffPos, toLoad,
                                        ip, iend-ip);
                zcs->inBuffPos += loaded;
+                if (loaded != 0)
                    ip += loaded;
                if ( (flushMode == ZSTD_e_continue)
                  && (zcs->inBuffPos < zcs->inBuffTarget) ) {
@ -3758,6 +3859,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                            zcs->outBuff + zcs->outBuffFlushedSize, toFlush);
                DEBUGLOG(5, "toFlush: %u into %u ==> flushed: %u",
                            (unsigned)toFlush, (unsigned)(oend-op), (unsigned)flushed);
+                if (flushed)
                    op += flushed;
                zcs->outBuffFlushedSize += flushed;
                if (toFlush!=flushed) {
@ -4069,35 +4171,56 @@ static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEV
 },
 };

-/*! ZSTD_getCParams() :
+/*! ZSTD_getCParams_internal() :
 * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize.
- *  Size values are optional, provide 0 if not known or unused */
-ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize)
+ *  Note: srcSizeHint 0 means 0, use ZSTD_CONTENTSIZE_UNKNOWN for unknown.
+ *        Use dictSize == 0 for unknown or unused. */
+static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize)
 {
-    size_t const addedSize = srcSizeHint ? 0 : 500;
-    U64 const rSize = srcSizeHint+dictSize ? srcSizeHint+dictSize+addedSize : ZSTD_CONTENTSIZE_UNKNOWN;  /* intentional overflow for srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN */
+    int const unknown = srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN;
+    size_t const addedSize = unknown && dictSize > 0 ? 500 : 0;
+    U64 const rSize = unknown && dictSize == 0 ? ZSTD_CONTENTSIZE_UNKNOWN : srcSizeHint+dictSize+addedSize;
    U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB);
    int row = compressionLevel;
-    DEBUGLOG(5, "ZSTD_getCParams (cLevel=%i)", compressionLevel);
+    DEBUGLOG(5, "ZSTD_getCParams_internal (cLevel=%i)", compressionLevel);
    if (compressionLevel == 0) row = ZSTD_CLEVEL_DEFAULT;   /* 0 == default */
    if (compressionLevel < 0) row = 0;   /* entry 0 is baseline for fast mode */
    if (compressionLevel > ZSTD_MAX_CLEVEL) row = ZSTD_MAX_CLEVEL;
    {   ZSTD_compressionParameters cp = ZSTD_defaultCParameters[tableID][row];
        if (compressionLevel < 0) cp.targetLength = (unsigned)(-compressionLevel);   /* acceleration factor */
-        return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize);               /* refine parameters based on srcSize & dictSize */
+        /* refine parameters based on srcSize & dictSize */
+        return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize);
    }
 }

+/*! ZSTD_getCParams() :
+ * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize.
+ *  Size values are optional, provide 0 if not known or unused */
+ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize)
+{
+    if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN;
+    return ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize);
+}
+
+/*! ZSTD_getParams() :
+ *  same idea as ZSTD_getCParams()
+ * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`).
+ *  Fields of `ZSTD_frameParameters` are set to default values */
+static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) {
+    ZSTD_parameters params;
+    ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize);
+    DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel);
+    memset(&params, 0, sizeof(params));
+    params.cParams = cParams;
+    params.fParams.contentSizeFlag = 1;
+    return params;
+}
+
 /*! ZSTD_getParams() :
 *  same idea as ZSTD_getCParams()
 * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`).
 *  Fields of `ZSTD_frameParameters` are set to default values */
 ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) {
-    ZSTD_parameters params;
-    ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, srcSizeHint, dictSize);
-    DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel);
-    memset(&params, 0, sizeof(params));
-    params.cParams = cParams;
-    params.fParams.contentSizeFlag = 1;
-    return params;
+    if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN;
+    return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize);
 }
--- a/lib/compress/zstd_compress_internal.h
+++ b/lib/compress/zstd_compress_internal.h
@ -249,6 +249,7 @@ struct ZSTD_CCtx_s {
    size_t staticSize;
    SeqCollector seqCollector;
    int isFirstBlock;
+    int initialized;

    seqStore_t seqStore;      /* sequences storage ptrs */
    ldmState_t ldmState;      /* long distance matching state */
@ -336,6 +337,20 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value)
    return 1;
 }

+/* ZSTD_noCompressBlock() :
+ * Writes uncompressed block to dst buffer from given src.
+ * Returns the size of the block */
+MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock)
+{
+    U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3);
+    RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity,
+                    dstSize_tooSmall);
+    MEM_writeLE24(dst, cBlockHeader24);
+    memcpy((BYTE*)dst + ZSTD_blockHeaderSize, src, srcSize);
+    return ZSTD_blockHeaderSize + srcSize;
+}
+
+
 /* ZSTD_minGain() :
 * minimum compression required
 * to generate a compress block or a compressed literals section.
@ -844,6 +859,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window,
    }   }   }
 }

+MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) {
+    memset(window, 0, sizeof(*window));
+    window->base = (BYTE const*)"";
+    window->dictBase = (BYTE const*)"";
+    window->dictLimit = 1;    /* start from 1, so that 1st position is valid */
+    window->lowLimit = 1;     /* it ensures first and later CCtx usages compress the same */
+    window->nextSrc = window->base + 1;   /* see issue #1241 */
+}
+
 /**
 * ZSTD_window_update():
 * Updates the window by appending [src, src + srcSize) to the window.
@ -857,6 +881,10 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,
    BYTE const* const ip = (BYTE const*)src;
    U32 contiguous = 1;
    DEBUGLOG(5, "ZSTD_window_update");
+    if (srcSize == 0)
+        return contiguous;
+    assert(window->base != NULL);
+    assert(window->dictBase != NULL);
    /* Check if blocks follow each other */
    if (src != window->nextSrc) {
        /* not contiguous */
@ -931,6 +959,21 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)
 }
 #endif

+/* ===============================================================
+ * Shared internal declarations
+ * These prototypes may be called from sources not in lib/compress
+ * =============================================================== */
+
+/* ZSTD_loadCEntropy() :
+ * dict : must point at beginning of a valid zstd dictionary.
+ * return : size of dictionary header (size of magic number + dict ID + entropy tables)
+ * assumptions : magic number supposed already checked
+ *               and dictSize >= 8 */
+size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
+                         short* offcodeNCount, unsigned* offcodeMaxValue,
+                         const void* const dict, size_t dictSize);
+
+void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs);

 /* ==============================================================
 * Private declarations
@ -940,6 +983,7 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)
 /* ZSTD_getCParamsFromCCtxParams() :
 * cParams are built depending on compressionLevel, src size hints,
 * LDM and manually set compression parameters.
+ * Note: srcSizeHint == 0 means 0!
 */
 ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
        const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize);
--- a/lib/compress/zstd_compress_literals.c
+++ b/lib/compress/zstd_compress_literals.c
@ -102,11 +102,11 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
        cLitSize = singleStream ?
            HUF_compress1X_repeat(
                ostart+lhSize, dstCapacity-lhSize, src, srcSize,
-                255, 11, entropyWorkspace, entropyWorkspaceSize,
+                HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
                (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2) :
            HUF_compress4X_repeat(
                ostart+lhSize, dstCapacity-lhSize, src, srcSize,
-                255, 11, entropyWorkspace, entropyWorkspaceSize,
+                HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
                (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2);
        if (repeat != HUF_repeat_none) {
            /* reused the existing table */
--- a/lib/compress/zstd_compress_sequences.c
+++ b/lib/compress/zstd_compress_sequences.c
@ -86,7 +86,7 @@ static size_t ZSTD_entropyCost(unsigned const* count, unsigned const max, size_t
 * Returns the cost in bits of encoding the distribution in count using ctable.
 * Returns an error if ctable cannot represent all the symbols in count.
 */
-static size_t ZSTD_fseBitCost(
+size_t ZSTD_fseBitCost(
    FSE_CTable const* ctable,
    unsigned const* count,
    unsigned const max)
@ -117,7 +117,7 @@ static size_t ZSTD_fseBitCost(
 * table described by norm. The max symbol support by norm is assumed >= max.
 * norm must be valid for every symbol with non-zero probability in count.
 */
-static size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog,
+size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog,
                             unsigned const* count, unsigned const max)
 {
    unsigned const shift = 8 - accuracyLog;
--- a/lib/compress/zstd_compress_sequences.h
+++ b/lib/compress/zstd_compress_sequences.h
@ -44,4 +44,11 @@ size_t ZSTD_encodeSequences(
            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
            seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2);

+size_t ZSTD_fseBitCost(
+    FSE_CTable const* ctable,
+    unsigned const* count,
+    unsigned const max);
+
+size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog,
+                             unsigned const* count, unsigned const max);
 #endif /* ZSTD_COMPRESS_SEQUENCES_H */
--- a/lib/compress/zstd_compress_superblock.c
+++ b/lib/compress/zstd_compress_superblock.c
@ -0,0 +1,742 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ /*-*************************************
+ *  Dependencies
+ ***************************************/
+#include "hist.h"           /* HIST_countFast_wksp */
+#include "zstd_compress_internal.h"
+#include "zstd_compress_sequences.h"
+#include "zstd_compress_literals.h"
+#include "zstd_compress_superblock.h"
+
+/*-*************************************
+*  Superblock entropy buffer structs
+***************************************/
+/** ZSTD_hufCTablesMetadata_t :
+ *  Stores Literals Block Type for a super-block in hType, and
+ *  huffman tree description in hufDesBuffer.
+ *  hufDesSize refers to the size of huffman tree description in bytes.
+ *  This metadata is populated in ZSTD_buildSuperBlockEntropy_literal() */
+typedef struct {
+    symbolEncodingType_e hType;
+    BYTE hufDesBuffer[500]; // TODO give name to this value
+    size_t hufDesSize;
+} ZSTD_hufCTablesMetadata_t;
+
+/** ZSTD_fseCTablesMetadata_t :
+ *  Stores symbol compression modes for a super-block in {ll, ol, ml}Type, and
+ *  fse tables in fseTablesBuffer.
+ *  fseTablesSize refers to the size of fse tables in bytes.
+ *  This metadata is populated in ZSTD_buildSuperBlockEntropy_sequences() */
+typedef struct {
+    symbolEncodingType_e llType;
+    symbolEncodingType_e ofType;
+    symbolEncodingType_e mlType;
+    BYTE fseTablesBuffer[500]; // TODO give name to this value
+    size_t fseTablesSize;
+    size_t lastCountSize; // This is to account for bug in 1.3.4. More detail in ZSTD_compressSubBlock_sequences()
+} ZSTD_fseCTablesMetadata_t;
+
+typedef struct {
+    ZSTD_hufCTablesMetadata_t hufMetadata;
+    ZSTD_fseCTablesMetadata_t fseMetadata;
+} ZSTD_entropyCTablesMetadata_t;
+
+
+/** ZSTD_buildSuperBlockEntropy_literal() :
+ *  Builds entropy for the super-block literals.
+ *  Stores literals block type (raw, rle, compressed) and
+ *  huffman description table to hufMetadata.
+ *  Currently, this does not consider the option of reusing huffman table from
+ *  previous super-block. I think it would be a good improvement to add that option.
+ *  @return : size of huffman description table or error code */
+static size_t ZSTD_buildSuperBlockEntropy_literal(void* const src, size_t srcSize,
+                                            const ZSTD_hufCTables_t* prevHuf,
+                                                  ZSTD_hufCTables_t* nextHuf,
+                                                  ZSTD_hufCTablesMetadata_t* hufMetadata,
+                                                  void* workspace, size_t wkspSize)
+{
+    BYTE* const wkspStart = (BYTE*)workspace;
+    BYTE* const wkspEnd = wkspStart + wkspSize;
+    BYTE* const countWkspStart = wkspStart;
+    unsigned* const countWksp = (unsigned*)workspace;
+    const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned);
+    BYTE* const nodeWksp = countWkspStart + countWkspSize;
+    const size_t nodeWkspSize = wkspEnd-nodeWksp;
+    unsigned maxSymbolValue = 255;
+    unsigned huffLog = 11;
+
+    DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_literal (srcSize=%zu)", srcSize);
+
+    /* Prepare nextEntropy assuming reusing the existing table */
+    memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+
+    /* small ? don't even attempt compression (speed opt) */
+#   define COMPRESS_LITERALS_SIZE_MIN 63
+    {   size_t const minLitSize = COMPRESS_LITERALS_SIZE_MIN;
+        if (srcSize <= minLitSize) { hufMetadata->hType = set_basic; return 0; }
+    }
+
+    /* Scan input and build symbol stats */
+    {   size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize);
+        FORWARD_IF_ERROR(largest);
+        if (largest == srcSize) { hufMetadata->hType = set_rle; return 0; }
+        if (largest <= (srcSize >> 7)+4) { hufMetadata->hType = set_basic; return 0; }
+    }
+
+
+    /* Build Huffman Tree */
+    memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable));
+    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
+    {   size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp,
+                                                    maxSymbolValue, huffLog,
+                                                    nodeWksp, nodeWkspSize);
+        FORWARD_IF_ERROR(maxBits);
+        huffLog = (U32)maxBits;
+        {   size_t cSize = HUF_estimateCompressedSize(
+                              (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue);
+            size_t hSize = HUF_writeCTable(
+                              hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer),
+                              (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog);
+            if (cSize + hSize >= srcSize) { hufMetadata->hType = set_basic; return 0; }
+            hufMetadata->hType = set_compressed;
+            return hSize;
+        }
+    }
+}
+
+/** ZSTD_buildSuperBlockEntropy_sequences() :
+ *  Builds entropy for the super-block sequences.
+ *  Stores symbol compression modes and fse table to fseMetadata.
+ *  @return : size of fse tables or error code */
+static size_t ZSTD_buildSuperBlockEntropy_sequences(seqStore_t* seqStorePtr,
+                                              const ZSTD_fseCTables_t* prevEntropy,
+                                                    ZSTD_fseCTables_t* nextEntropy,
+                                              const ZSTD_CCtx_params* cctxParams,
+                                                    ZSTD_fseCTablesMetadata_t* fseMetadata,
+                                                    void* workspace, size_t wkspSize)
+{
+    BYTE* const wkspStart = (BYTE*)workspace;
+    BYTE* const wkspEnd = wkspStart + wkspSize;
+    BYTE* const countWkspStart = wkspStart;
+    unsigned* const countWksp = (unsigned*)workspace;
+    const size_t countWkspSize = (MaxSeq + 1) * sizeof(unsigned);
+    BYTE* const cTableWksp = countWkspStart + countWkspSize;
+    const size_t cTableWkspSize = wkspEnd-cTableWksp;
+    ZSTD_strategy const strategy = cctxParams->cParams.strategy;
+    FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable;
+    FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable;
+    FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable;
+    const BYTE* const ofCodeTable = seqStorePtr->ofCode;
+    const BYTE* const llCodeTable = seqStorePtr->llCode;
+    const BYTE* const mlCodeTable = seqStorePtr->mlCode;
+    size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
+    BYTE* const ostart = fseMetadata->fseTablesBuffer;
+    BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer);
+    BYTE* op = ostart;
+
+    assert(cTableWkspSize >= (1 << MaxFSELog) * sizeof(FSE_FUNCTION_TYPE));
+    DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_sequences (nbSeq=%zu)", nbSeq);
+    memset(workspace, 0, wkspSize);
+
+    fseMetadata->lastCountSize = 0;
+    /* convert length/distances into codes */
+    ZSTD_seqToCodes(seqStorePtr);
+    /* build CTable for Literal Lengths */
+    {   U32 LLtype;
+        unsigned max = MaxLL;
+        size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, llCodeTable, nbSeq, workspace, wkspSize);  /* can't fail */
+        DEBUGLOG(5, "Building LL table");
+        nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode;
+        LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode,
+                                        countWksp, max, mostFrequent, nbSeq,
+                                        LLFSELog, prevEntropy->litlengthCTable,
+                                        LL_defaultNorm, LL_defaultNormLog,
+                                        ZSTD_defaultAllowed, strategy);
+        assert(set_basic < set_compressed && set_rle < set_compressed);
+        assert(!(LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+        {   size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype,
+                                                    countWksp, max, llCodeTable, nbSeq, LL_defaultNorm, LL_defaultNormLog, MaxLL,
+                                                    prevEntropy->litlengthCTable, sizeof(prevEntropy->litlengthCTable),
+                                                    cTableWksp, cTableWkspSize);
+            FORWARD_IF_ERROR(countSize);
+            if (LLtype == set_compressed)
+                fseMetadata->lastCountSize = countSize;
+            op += countSize;
+            fseMetadata->llType = (symbolEncodingType_e) LLtype;
+    }   }
+    /* build CTable for Offsets */
+    {   U32 Offtype;
+        unsigned max = MaxOff;
+        size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, ofCodeTable, nbSeq, workspace, wkspSize);  /* can't fail */
+        /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */
+        ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed;
+        DEBUGLOG(5, "Building OF table");
+        nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode;
+        Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode,
+                                        countWksp, max, mostFrequent, nbSeq,
+                                        OffFSELog, prevEntropy->offcodeCTable,
+                                        OF_defaultNorm, OF_defaultNormLog,
+                                        defaultPolicy, strategy);
+        assert(!(Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+        {   size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype,
+                                                    countWksp, max, ofCodeTable, nbSeq, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
+                                                    prevEntropy->offcodeCTable, sizeof(prevEntropy->offcodeCTable),
+                                                    cTableWksp, cTableWkspSize);
+            FORWARD_IF_ERROR(countSize);
+            if (Offtype == set_compressed)
+                fseMetadata->lastCountSize = countSize;
+            op += countSize;
+            fseMetadata->ofType = (symbolEncodingType_e) Offtype;
+    }   }
+    /* build CTable for MatchLengths */
+    {   U32 MLtype;
+        unsigned max = MaxML;
+        size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, mlCodeTable, nbSeq, workspace, wkspSize);   /* can't fail */
+        DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op));
+        nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode;
+        MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode,
+                                        countWksp, max, mostFrequent, nbSeq,
+                                        MLFSELog, prevEntropy->matchlengthCTable,
+                                        ML_defaultNorm, ML_defaultNormLog,
+                                        ZSTD_defaultAllowed, strategy);
+        assert(!(MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+        {   size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype,
+                                                    countWksp, max, mlCodeTable, nbSeq, ML_defaultNorm, ML_defaultNormLog, MaxML,
+                                                    prevEntropy->matchlengthCTable, sizeof(prevEntropy->matchlengthCTable),
+                                                    cTableWksp, cTableWkspSize);
+            FORWARD_IF_ERROR(countSize);
+            if (MLtype == set_compressed)
+                fseMetadata->lastCountSize = countSize;
+            op += countSize;
+            fseMetadata->mlType = (symbolEncodingType_e) MLtype;
+    }   }
+    assert((size_t) (op-ostart) <= sizeof(fseMetadata->fseTablesBuffer));
+    return op-ostart;
+}
+
+
+/** ZSTD_buildSuperBlockEntropy() :
+ *  Builds entropy for the super-block.
+ *  @return : 0 on success or error code */
+static size_t
+ZSTD_buildSuperBlockEntropy(seqStore_t* seqStorePtr,
+                      const ZSTD_entropyCTables_t* prevEntropy,
+                            ZSTD_entropyCTables_t* nextEntropy,
+                      const ZSTD_CCtx_params* cctxParams,
+                            ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                            void* workspace, size_t wkspSize)
+{
+    size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart;
+    DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy");
+    entropyMetadata->hufMetadata.hufDesSize =
+        ZSTD_buildSuperBlockEntropy_literal(seqStorePtr->litStart, litSize,
+                                            &prevEntropy->huf, &nextEntropy->huf,
+                                            &entropyMetadata->hufMetadata,
+                                            workspace, wkspSize);
+    FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize);
+    entropyMetadata->fseMetadata.fseTablesSize =
+        ZSTD_buildSuperBlockEntropy_sequences(seqStorePtr,
+                                              &prevEntropy->fse, &nextEntropy->fse,
+                                              cctxParams,
+                                              &entropyMetadata->fseMetadata,
+                                              workspace, wkspSize);
+    FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize);
+    return 0;
+}
+
+/** ZSTD_compressSubBlock_literal() :
+ *  Compresses literals section for a sub-block.
+ *  Compressed literal size needs to be less than uncompressed literal size.
+ *      ZSTD spec doesn't have this constaint. I will explain why I have this constraint here.
+ *      Literals section header size ranges from 1 to 5 bytes,
+ *      which is dictated by regenerated size and compressed size.
+ *      In order to figure out the memory address to start writing compressed literal,
+ *      it is necessary to figure out the literals section header size.
+ *      The challenge is that compressed size is only known after compression.
+ *      This is a chicken and egg problem.
+ *      I am simplifying the problem by assuming that
+ *      compressed size will always be less than or equal to regenerated size,
+ *      and using regenerated size to calculate literals section header size.
+ *  hufMetadata->hType has literals block type info.
+ *      If it is set_basic, all sub-blocks literals section will be Raw_Literals_Block.
+ *      If it is set_rle, all sub-blocks literals section will be RLE_Literals_Block.
+ *      If it is set_compressed, first sub-block's literals section will be Compressed_Literals_Block
+ *      and the following sub-blocks' literals sections will be Treeless_Literals_Block.
+ *  @return : compressed size of literals section of a sub-block
+ *            Or 0 if it unable to compress.
+ *            Or error code */
+static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
+                                    const ZSTD_hufCTablesMetadata_t* hufMetadata,
+                                    const BYTE* literals, size_t litSize,
+                                    void* dst, size_t dstSize,
+                                    const int bmi2, int writeEntropy)
+{
+    size_t const lhSize = 3 + (litSize >= 1 KB) + (litSize >= 16 KB);
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstSize;
+    BYTE* op = ostart + lhSize;
+    U32 singleStream = litSize < 256;
+    symbolEncodingType_e hType = writeEntropy ? set_compressed : set_repeat;
+    size_t cLitSize = 0;
+
+    (void)bmi2; // TODO bmi2...
+
+    DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy);
+
+    if (writeEntropy && litSize == 0) {
+      /* Literals section cannot be compressed mode when litSize == 0.
+       * (This seems to be decoder constraint.)
+       * Entropy cannot be written if literals section is not compressed mode.
+       */
+      return 0;
+    }
+
+    if (litSize == 0 || hufMetadata->hType == set_basic) {
+      DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal");
+      return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
+    } else if (hufMetadata->hType == set_rle) {
+      DEBUGLOG(5, "ZSTD_compressSubBlock_literal using rle literal");
+      return ZSTD_compressRleLiteralsBlock(dst, dstSize, literals, litSize);
+    }
+
+    if (lhSize == 3) singleStream = 1;
+    if (writeEntropy) {
+        memcpy(op, hufMetadata->hufDesBuffer, hufMetadata->hufDesSize);
+        op += hufMetadata->hufDesSize;
+        cLitSize += hufMetadata->hufDesSize;
+        DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize);
+    }
+
+    // TODO bmi2
+    {   const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable)
+                                          : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable);
+        op += cSize;
+        cLitSize += cSize;
+        if (cSize == 0 || ERR_isError(cSize)) {
+          return 0;
+        }
+        if (cLitSize > litSize) {
+            if (writeEntropy) return 0;
+            else return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
+        }
+        DEBUGLOG(5, "ZSTD_compressSubBlock_literal (cSize=%zu)", cSize);
+    }
+
+    /* Build header */
+    switch(lhSize)
+    {
+    case 3: /* 2 - 2 - 10 - 10 */
+        {   U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14);
+            MEM_writeLE24(ostart, lhc);
+            break;
+        }
+    case 4: /* 2 - 2 - 14 - 14 */
+        {   U32 const lhc = hType + (2 << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<18);
+            MEM_writeLE32(ostart, lhc);
+            break;
+        }
+    case 5: /* 2 - 2 - 18 - 18 */
+        {   U32 const lhc = hType + (3 << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<22);
+            MEM_writeLE32(ostart, lhc);
+            ostart[4] = (BYTE)(cLitSize >> 10);
+            break;
+        }
+    default:  /* not possible : lhSize is {3,4,5} */
+        assert(0);
+    }
+    return op-ostart;
+}
+
+static size_t ZSTD_seqDecompressedSize(const seqDef* sequences, size_t nbSeq, size_t litSize) {
+    const seqDef* const sstart = sequences;
+    const seqDef* const send = sequences + nbSeq;
+    const seqDef* sp = sstart;
+    size_t matchLengthSum = 0;
+    while (send-sp > 0) {
+      matchLengthSum += sp->matchLength + MINMATCH;
+      sp++;
+    }
+    return matchLengthSum + litSize;
+}
+
+/** ZSTD_compressSubBlock_sequences() :
+ *  Compresses sequences section for a sub-block.
+ *  fseMetadata->llType, fseMetadata->ofType, and fseMetadata->mlType have
+ *  symbol compression modes for the super-block.
+ *  First sub-block will have these in its header. The following sub-blocks
+ *  will always have repeat mode.
+ *  @return : compressed size of sequences section of a sub-block
+ *            Or 0 if it is unable to compress
+ *            Or error code. */
+static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
+                                              const ZSTD_fseCTablesMetadata_t* fseMetadata,
+                                              const seqDef* sequences, size_t nbSeq,
+                                              const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
+                                              const ZSTD_CCtx_params* cctxParams,
+                                              void* dst, size_t dstCapacity,
+                                              const int bmi2, int writeEntropy)
+{
+    const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart;
+    BYTE* seqHead;
+
+    DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (nbSeq=%zu, writeEntropy=%d, longOffsets=%d)", nbSeq, writeEntropy, longOffsets);
+
+    /* Sequences Header */
+    RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/,
+                    dstSize_tooSmall);
+    if (nbSeq < 0x7F)
+        *op++ = (BYTE)nbSeq;
+    else if (nbSeq < LONGNBSEQ)
+        op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2;
+    else
+        op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;
+    if (writeEntropy && nbSeq == 0) {
+        return 0;
+    }
+    if (nbSeq==0) {
+        return op - ostart;
+    }
+
+    /* seqHead : flags for FSE encoding type */
+    seqHead = op++;
+
+    DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (seqHeadSize=%u)", (unsigned)(op-ostart));
+
+    if (writeEntropy) {
+        const U32 LLtype = fseMetadata->llType;
+        const U32 Offtype = fseMetadata->ofType;
+        const U32 MLtype = fseMetadata->mlType;
+        DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (fseTablesSize=%zu)", fseMetadata->fseTablesSize);
+        *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2));
+        memcpy(op, fseMetadata->fseTablesBuffer, fseMetadata->fseTablesSize);
+        op += fseMetadata->fseTablesSize;
+    } else {
+        const U32 repeat = set_repeat;
+        *seqHead = (BYTE)((repeat<<6) + (repeat<<4) + (repeat<<2));
+    }
+
+    {   size_t const bitstreamSize = ZSTD_encodeSequences(
+                                        op, oend - op,
+                                        fseTables->matchlengthCTable, mlCode,
+                                        fseTables->offcodeCTable, ofCode,
+                                        fseTables->litlengthCTable, llCode,
+                                        sequences, nbSeq,
+                                        longOffsets, bmi2);
+        FORWARD_IF_ERROR(bitstreamSize);
+        op += bitstreamSize;
+        /* zstd versions <= 1.3.4 mistakenly report corruption when
+         * FSE_readNCount() receives a buffer < 4 bytes.
+         * Fixed by https://github.com/facebook/zstd/pull/1146.
+         * This can happen when the last set_compressed table present is 2
+         * bytes and the bitstream is only one byte.
+         * In this exceedingly rare case, we will simply emit an uncompressed
+         * block, since it isn't worth optimizing.
+         */
+        if (writeEntropy && fseMetadata->lastCountSize && fseMetadata->lastCountSize + bitstreamSize < 4) {
+            /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */
+            assert(fseMetadata->lastCountSize + bitstreamSize == 3);
+            DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by "
+                        "emitting an uncompressed block.");
+            return 0;
+        }
+        DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (bitstreamSize=%zu)", bitstreamSize);
+    }
+
+    /* zstd versions <= 1.4.0 mistakenly report error when
+     * sequences section body size is less than 3 bytes.
+     * Fixed by https://github.com/facebook/zstd/pull/1664.
+     * This can happen when the previous sequences section block is compressed
+     * with rle mode and the current block's sequences section is compressed
+     * with repeat mode where sequences section body size can be 1 byte.
+     */
+    if (op-seqHead < 4) {
+        return 0;
+    }
+
+    return op - ostart;
+}
+
+/** ZSTD_compressSubBlock() :
+ *  Compresses a single sub-block.
+ *  @return : compressed size of the sub-block
+ *            Or 0 if it failed to compress. */
+static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy,
+                                    const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                                    const seqDef* sequences, size_t nbSeq,
+                                    const BYTE* literals, size_t litSize,
+                                    const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
+                                    const ZSTD_CCtx_params* cctxParams,
+                                    void* dst, size_t dstCapacity,
+                                    const int bmi2, int writeEntropy, U32 lastBlock)
+{
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart + ZSTD_blockHeaderSize;
+    DEBUGLOG(5, "ZSTD_compressSubBlock (litSize=%zu, nbSeq=%zu, writeEntropy=%d, lastBlock=%d)",
+                litSize, nbSeq, writeEntropy, lastBlock);
+    {   size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable,
+                                                        &entropyMetadata->hufMetadata, literals, litSize,
+                                                        op, oend-op, bmi2, writeEntropy);
+        FORWARD_IF_ERROR(cLitSize);
+        if (cLitSize == 0) return 0;
+        op += cLitSize;
+    }
+    {   size_t cSeqSize = ZSTD_compressSubBlock_sequences(&entropy->fse,
+                                                  &entropyMetadata->fseMetadata,
+                                                  sequences, nbSeq,
+                                                  llCode, mlCode, ofCode,
+                                                  cctxParams,
+                                                  op, oend-op,
+                                                  bmi2, writeEntropy);
+        FORWARD_IF_ERROR(cSeqSize);
+        if (cSeqSize == 0) return 0;
+        op += cSeqSize;
+    }
+    /* Write block header */
+    {   size_t cSize = (op-ostart)-ZSTD_blockHeaderSize;
+        U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
+        MEM_writeLE24(ostart, cBlockHeader24);
+    }
+    return op-ostart;
+}
+
+static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize,
+                                                const ZSTD_hufCTables_t* huf,
+                                                const ZSTD_hufCTablesMetadata_t* hufMetadata,
+                                                void* workspace, size_t wkspSize,
+                                                int writeEntropy)
+{
+    unsigned* const countWksp = (unsigned*)workspace;
+    unsigned maxSymbolValue = 255;
+    size_t literalSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */
+
+    if (hufMetadata->hType == set_basic) return litSize;
+    else if (hufMetadata->hType == set_rle) return 1;
+    else if (hufMetadata->hType == set_compressed) {
+        size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)literals, litSize, workspace, wkspSize);
+        if (ZSTD_isError(largest)) return litSize;
+        {   size_t cLitSizeEstimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, countWksp, maxSymbolValue);
+            if (writeEntropy) cLitSizeEstimate += hufMetadata->hufDesSize;
+            return cLitSizeEstimate + literalSectionHeaderSize;
+    }   }
+    assert(0); /* impossible */
+    return 0;
+}
+
+static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e type,
+                        const BYTE* codeTable, unsigned maxCode,
+                        size_t nbSeq, const FSE_CTable* fseCTable,
+                        const U32* additionalBits,
+                        short const* defaultNorm, U32 defaultNormLog,
+                        void* workspace, size_t wkspSize)
+{
+    unsigned* const countWksp = (unsigned*)workspace;
+    const BYTE* ctp = codeTable;
+    const BYTE* const ctStart = ctp;
+    const BYTE* const ctEnd = ctStart + nbSeq;
+    size_t cSymbolTypeSizeEstimateInBits = 0;
+    unsigned max = maxCode;
+
+    HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wkspSize);  /* can't fail */
+    if (type == set_basic) {
+        cSymbolTypeSizeEstimateInBits = ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, countWksp, max);
+    } else if (type == set_rle) {
+        cSymbolTypeSizeEstimateInBits = 0;
+    } else if (type == set_compressed || type == set_repeat) {
+        cSymbolTypeSizeEstimateInBits = ZSTD_fseBitCost(fseCTable, countWksp, max);
+    }
+    if (ZSTD_isError(cSymbolTypeSizeEstimateInBits)) return nbSeq * 10;
+    while (ctp < ctEnd) {
+        if (additionalBits) cSymbolTypeSizeEstimateInBits += additionalBits[*ctp];
+        else cSymbolTypeSizeEstimateInBits += *ctp; /* for offset, offset code is also the number of additional bits */
+        ctp++;
+    }
+    return cSymbolTypeSizeEstimateInBits / 8;
+}
+
+static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable,
+                                                  const BYTE* llCodeTable,
+                                                  const BYTE* mlCodeTable,
+                                                  size_t nbSeq,
+                                                  const ZSTD_fseCTables_t* fseTables,
+                                                  const ZSTD_fseCTablesMetadata_t* fseMetadata,
+                                                  void* workspace, size_t wkspSize,
+                                                  int writeEntropy)
+{
+    size_t sequencesSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */
+    size_t cSeqSizeEstimate = 0;
+    cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, MaxOff,
+                                         nbSeq, fseTables->offcodeCTable, NULL,
+                                         OF_defaultNorm, OF_defaultNormLog,
+                                         workspace, wkspSize);
+    cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->llType, llCodeTable, MaxLL,
+                                         nbSeq, fseTables->litlengthCTable, LL_bits,
+                                         LL_defaultNorm, LL_defaultNormLog,
+                                         workspace, wkspSize);
+    cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, MaxML,
+                                         nbSeq, fseTables->matchlengthCTable, ML_bits,
+                                         ML_defaultNorm, ML_defaultNormLog,
+                                         workspace, wkspSize);
+    if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize;
+    return cSeqSizeEstimate + sequencesSectionHeaderSize;
+}
+
+static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
+                                        const BYTE* ofCodeTable,
+                                        const BYTE* llCodeTable,
+                                        const BYTE* mlCodeTable,
+                                        size_t nbSeq,
+                                        const ZSTD_entropyCTables_t* entropy,
+                                        const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                                        void* workspace, size_t wkspSize,
+                                        int writeEntropy) {
+    size_t cSizeEstimate = 0;
+    cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize,
+                                                         &entropy->huf, &entropyMetadata->hufMetadata,
+                                                         workspace, wkspSize, writeEntropy);
+    cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
+                                                         nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
+                                                         workspace, wkspSize, writeEntropy);
+    return cSizeEstimate + ZSTD_blockHeaderSize;
+}
+
+/** ZSTD_compressSubBlock_multi() :
+ *  Breaks super-block into multiple sub-blocks and compresses them.
+ *  Entropy will be written to the first block.
+ *  The following blocks will use repeat mode to compress.
+ *  All sub-blocks are compressed blocks (no raw or rle blocks).
+ *  @return : compressed size of the super block (which is multiple ZSTD blocks)
+ *            Or 0 if it failed to compress. */
+static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
+                            const ZSTD_entropyCTables_t* entropy,
+                            const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                            const ZSTD_CCtx_params* cctxParams,
+                                  void* dst, size_t dstCapacity,
+                            const int bmi2, U32 lastBlock,
+                            void* workspace, size_t wkspSize)
+{
+    const seqDef* const sstart = seqStorePtr->sequencesStart;
+    const seqDef* const send = seqStorePtr->sequences;
+    const seqDef* sp = sstart;
+    const BYTE* const lstart = seqStorePtr->litStart;
+    const BYTE* const lend = seqStorePtr->lit;
+    const BYTE* lp = lstart;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart;
+    const BYTE* llCodePtr = seqStorePtr->llCode;
+    const BYTE* mlCodePtr = seqStorePtr->mlCode;
+    const BYTE* ofCodePtr = seqStorePtr->ofCode;
+    size_t targetCBlockSize = cctxParams->targetCBlockSize;
+    size_t litSize, seqCount;
+    int writeEntropy = 1;
+    size_t remaining = ZSTD_seqDecompressedSize(sstart, send-sstart, lend-lstart);
+    size_t cBlockSizeEstimate = 0;
+
+    DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)",
+                (unsigned)(lend-lp), (unsigned)(send-sstart));
+
+    litSize = 0;
+    seqCount = 0;
+    while (sp + seqCount < send) {
+        const seqDef* const sequence = sp + seqCount;
+        const U32 lastSequence = sequence+1 == send;
+        litSize = (sequence == send) ? (size_t)(lend-lp) : litSize + sequence->litLength;
+        seqCount++;
+        /* I think there is an optimization opportunity here.
+         * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful
+         * since it recalculates estimate from scratch.
+         * For example, it would recount literal distribution and symbol codes everytime. 
+         */
+        cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount,
+                                                       entropy, entropyMetadata,
+                                                       workspace, wkspSize, writeEntropy);
+        if (cBlockSizeEstimate > targetCBlockSize || lastSequence) {
+            const size_t decompressedSize = ZSTD_seqDecompressedSize(sp, seqCount, litSize);
+            const size_t cSize = ZSTD_compressSubBlock(entropy, entropyMetadata,
+                                                       sp, seqCount,
+                                                       lp, litSize,
+                                                       llCodePtr, mlCodePtr, ofCodePtr,
+                                                       cctxParams,
+                                                       op, oend-op,
+                                                       bmi2, writeEntropy, lastBlock && lastSequence);
+            FORWARD_IF_ERROR(cSize);
+            if (cSize > 0 && cSize < decompressedSize) {
+                assert(remaining >= decompressedSize);
+                remaining -= decompressedSize;
+                sp += seqCount;
+                lp += litSize;
+                op += cSize;
+                llCodePtr += seqCount;
+                mlCodePtr += seqCount;
+                ofCodePtr += seqCount;
+                litSize = 0;
+                seqCount = 0;
+                writeEntropy = 0; // Entropy only needs to be written once
+            }
+        }
+    }
+    if (remaining) {
+        DEBUGLOG(5, "ZSTD_compressSubBlock_multi failed to compress");
+        return 0;
+    }
+    DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed");
+    return op-ostart;
+}
+
+size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc,
+                               void* dst, size_t dstCapacity,
+                               unsigned lastBlock) {
+    ZSTD_entropyCTablesMetadata_t entropyMetadata;
+
+    FORWARD_IF_ERROR(ZSTD_buildSuperBlockEntropy(&zc->seqStore,
+          &zc->blockState.prevCBlock->entropy,
+          &zc->blockState.nextCBlock->entropy,
+          &zc->appliedParams,
+          &entropyMetadata,
+          zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */));
+
+    return ZSTD_compressSubBlock_multi(&zc->seqStore,
+            &zc->blockState.nextCBlock->entropy,
+            &entropyMetadata,
+            &zc->appliedParams,
+            dst, dstCapacity,
+            zc->bmi2, lastBlock,
+            zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */);
+}
+
+size_t ZSTD_noCompressSuperBlock(void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize,
+                                 size_t targetCBlockSize,
+                                 unsigned lastBlock) {
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* ip = istart;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart;
+    DEBUGLOG(5, "ZSTD_noCompressSuperBlock (dstCapacity=%zu, srcSize=%zu, targetCBlockSize=%zu)",
+                dstCapacity, srcSize, targetCBlockSize);
+    while (ip < iend) {
+        size_t remaining = iend-ip;
+        unsigned lastSubBlock = remaining <= targetCBlockSize;
+        size_t blockSize = lastSubBlock ? remaining : targetCBlockSize;
+        size_t cSize = ZSTD_noCompressBlock(op, oend-op, ip, blockSize, lastSubBlock && lastBlock);
+        FORWARD_IF_ERROR(cSize);
+        ip += blockSize;
+        op += cSize;
+    }
+    return op-ostart;
+}
--- a/lib/compress/zstd_compress_superblock.h
+++ b/lib/compress/zstd_compress_superblock.h
@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMPRESS_ADVANCED_H
+#define ZSTD_COMPRESS_ADVANCED_H
+
+/*-*************************************
+*  Dependencies
+***************************************/
+
+#include "zstd.h" /* ZSTD_CCtx */
+
+/*-*************************************
+*  Target Compressed Block Size
+***************************************/
+
+/* ZSTD_compressSuperBlock() :
+ * Used to compress a super block when targetCBlockSize is being used.
+ * The given block will be compressed into multiple sub blocks that are around targetCBlockSize. */
+size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc,
+                               void* dst, size_t dstCapacity,
+                               unsigned lastBlock);
+
+/* ZSTD_noCompressSuperBlock() :
+* Used to break a super block into multiple uncompressed sub blocks
+* when targetCBlockSize is being used.
+* The given block will be broken into multiple uncompressed sub blocks that are
+* around targetCBlockSize. */
+size_t ZSTD_noCompressSuperBlock(void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize,
+                                 size_t targetCBlockSize,
+                                 unsigned lastBlock);
+
+#endif /* ZSTD_COMPRESS_ADVANCED_H */
--- a/lib/compress/zstd_double_fast.c
+++ b/lib/compress/zstd_double_fast.c
@ -96,7 +96,7 @@ size_t ZSTD_compressBlock_doubleFast_generic(
                                     dictCParams->hashLog : hBitsL;
    const U32 dictHBitsS           = dictMode == ZSTD_dictMatchState ?
                                     dictCParams->chainLog : hBitsS;
-    const U32 dictAndPrefixLength  = (U32)(ip - prefixLowest + dictEnd - dictStart);
+    const U32 dictAndPrefixLength  = (U32)((ip - prefixLowest) + (dictEnd - dictStart));

    DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_generic");

@ -271,7 +271,7 @@ _match_stored:
                    U32 const repIndex2 = current2 - offset_2;
                    const BYTE* repMatch2 = dictMode == ZSTD_dictMatchState
                        && repIndex2 < prefixLowestIndex ?
-                            dictBase - dictIndexDelta + repIndex2 :
+                            dictBase + repIndex2 - dictIndexDelta :
                            base + repIndex2;
                    if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */)
                       && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
--- a/lib/compress/zstd_lazy.c
+++ b/lib/compress/zstd_lazy.c
@ -660,7 +660,7 @@ ZSTD_compressBlock_lazy_generic(
    const U32 dictIndexDelta       = dictMode == ZSTD_dictMatchState ?
                                     prefixLowestIndex - (U32)(dictEnd - dictBase) :
                                     0;
-    const U32 dictAndPrefixLength = (U32)(ip - prefixLowest + dictEnd - dictLowest);
+    const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));

    /* init */
    ip += (dictAndPrefixLength == 0);
--- a/lib/compress/zstdmt_compress.c
+++ b/lib/compress/zstdmt_compress.c
@ -490,7 +490,7 @@ static int ZSTDMT_serialState_reset(serialState_t* serialState, ZSTDMT_seqPool*
        /* Size the seq pool tables */
        ZSTDMT_setNbSeq(seqPool, ZSTD_ldm_getMaxNbSeq(params.ldmParams, jobSize));
        /* Reset the window */
-        ZSTD_window_clear(&serialState->ldmState.window);
+        ZSTD_window_init(&serialState->ldmState.window);
        serialState->ldmWindow = serialState->ldmState.window;
        /* Resize tables and output space if necessary. */
        if (serialState->ldmState.hashTable == NULL || serialState->params.ldmParams.hashLog < hashLog) {
@ -1076,7 +1076,7 @@ void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_p
    DEBUGLOG(5, "ZSTDMT_updateCParams_whileCompressing (level:%i)",
                compressionLevel);
    mtctx->params.compressionLevel = compressionLevel;
-    {   ZSTD_compressionParameters cParams = ZSTD_getCParamsFromCCtxParams(cctxParams, 0, 0);
+    {   ZSTD_compressionParameters cParams = ZSTD_getCParamsFromCCtxParams(cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, 0);
        cParams.windowLog = saved_wlog;
        mtctx->params.cParams = cParams;
    }
@ -1786,7 +1786,7 @@ static int ZSTDMT_isOverlapped(buffer_t buffer, range_t range)
    BYTE const* const bufferStart = (BYTE const*)buffer.start;
    BYTE const* const bufferEnd = bufferStart + buffer.capacity;
    BYTE const* const rangeStart = (BYTE const*)range.start;
-    BYTE const* const rangeEnd = rangeStart + range.size;
+    BYTE const* const rangeEnd = range.size != 0 ? rangeStart + range.size : rangeStart;

    if (rangeStart == NULL || bufferStart == NULL)
        return 0;
--- a/lib/decompress/huf_decompress.c
+++ b/lib/decompress/huf_decompress.c
@ -181,17 +181,29 @@ size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize

    /* fill DTable */
    {   U32 n;
-        for (n=0; n<nbSymbols; n++) {
-            U32 const w = huffWeight[n];
-            U32 const length = (1 << w) >> 1;
-            U32 u;
+        size_t const nEnd = nbSymbols;
+        for (n=0; n<nEnd; n++) {
+            size_t const w = huffWeight[n];
+            size_t const length = (1 << w) >> 1;
+            size_t const uStart = rankVal[w];
+            size_t const uEnd = uStart + length;
+            size_t u;
            HUF_DEltX1 D;
-            D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w);
-            for (u = rankVal[w]; u < rankVal[w] + length; u++)
-                dt[u] = D;
-            rankVal[w] += length;
-    }   }
-
+            D.byte = (BYTE)n;
+            D.nbBits = (BYTE)(tableLog + 1 - w);
+            rankVal[w] = (U32)uEnd;
+            if (length < 4) {
+                /* Use length in the loop bound so the compiler knows it is short. */
+                for (u = 0; u < length; ++u)
+                    dt[uStart + u] = D;
+            } else {
+                /* Unroll the loop 4 times, we know it is a power of 2. */
+                for (u = uStart; u < uEnd; u += 4) {
+                    dt[u + 0] = D;
+                    dt[u + 1] = D;
+                    dt[u + 2] = D;
+                    dt[u + 3] = D;
+    }   }   }   }
    return iSize;
 }

@ -282,6 +294,7 @@ HUF_decompress4X1_usingDTable_internal_body(
    {   const BYTE* const istart = (const BYTE*) cSrc;
        BYTE* const ostart = (BYTE*) dst;
        BYTE* const oend = ostart + dstSize;
+        BYTE* const olimit = oend - 3;
        const void* const dtPtr = DTable + 1;
        const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;

@ -306,9 +319,9 @@ HUF_decompress4X1_usingDTable_internal_body(
        BYTE* op2 = opStart2;
        BYTE* op3 = opStart3;
        BYTE* op4 = opStart4;
-        U32 endSignal = BIT_DStream_unfinished;
        DTableDesc const dtd = HUF_getDTableDesc(DTable);
        U32 const dtLog = dtd.tableLog;
+        U32 endSignal = 1;

        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
        CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
@ -317,8 +330,7 @@ HUF_decompress4X1_usingDTable_internal_body(
        CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );

        /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
-        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
-        while ( (endSignal==BIT_DStream_unfinished) && (op4<(oend-3)) ) {
+        for ( ; (endSignal) & (op4 < olimit) ; ) {
            HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
            HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
            HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
@ -335,10 +347,10 @@ HUF_decompress4X1_usingDTable_internal_body(
            HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
            HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
            HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
-            BIT_reloadDStream(&bitD1);
-            BIT_reloadDStream(&bitD2);
-            BIT_reloadDStream(&bitD3);
-            BIT_reloadDStream(&bitD4);
+            endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
+            endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
+            endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
+            endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
        }

        /* check corruption */
@ -757,7 +769,6 @@ HUF_decompress1X2_usingDTable_internal_body(
    return dstSize;
 }

-
 FORCE_INLINE_TEMPLATE size_t
 HUF_decompress4X2_usingDTable_internal_body(
          void* dst,  size_t dstSize,
@ -769,6 +780,7 @@ HUF_decompress4X2_usingDTable_internal_body(
    {   const BYTE* const istart = (const BYTE*) cSrc;
        BYTE* const ostart = (BYTE*) dst;
        BYTE* const oend = ostart + dstSize;
+        BYTE* const olimit = oend - (sizeof(size_t)-1);
        const void* const dtPtr = DTable+1;
        const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;

@ -793,7 +805,7 @@ HUF_decompress4X2_usingDTable_internal_body(
        BYTE* op2 = opStart2;
        BYTE* op3 = opStart3;
        BYTE* op4 = opStart4;
-        U32 endSignal;
+        U32 endSignal = 1;
        DTableDesc const dtd = HUF_getDTableDesc(DTable);
        U32 const dtLog = dtd.tableLog;

@ -804,8 +816,29 @@ HUF_decompress4X2_usingDTable_internal_body(
        CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );

        /* 16-32 symbols per loop (4-8 symbols per stream) */
-        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
-        for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) {
+        for ( ; (endSignal) & (op4 < olimit); ) {
+#ifdef __clang__
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+            endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
+            endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+            endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
+            endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
+#else
            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
@ -822,8 +855,11 @@ HUF_decompress4X2_usingDTable_internal_body(
            HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
            HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
            HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
-
-            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+            endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
+            endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
+            endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
+            endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
+#endif
        }

        /* check corruption */
--- a/lib/decompress/zstd_decompress.c
+++ b/lib/decompress/zstd_decompress.c
@ -618,7 +618,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
 {
    const BYTE* ip = (const BYTE*)(*srcPtr);
    BYTE* const ostart = (BYTE* const)dst;
-    BYTE* const oend = ostart + dstCapacity;
+    BYTE* const oend = dstCapacity != 0 ? ostart + dstCapacity : ostart;
    BYTE* op = ostart;
    size_t remainingSrcSize = *srcSizePtr;

@ -669,7 +669,9 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
        if (ZSTD_isError(decodedSize)) return decodedSize;
        if (dctx->fParams.checksumFlag)
            XXH64_update(&dctx->xxhState, op, decodedSize);
+        if (decodedSize != 0)
            op += decodedSize;
+        assert(ip != NULL);
        ip += cBlockSize;
        remainingSrcSize -= cBlockSize;
        if (blockProperties.lastBlock) break;
@ -776,6 +778,7 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
                "error.");
            if (ZSTD_isError(res)) return res;
            assert(res <= dstCapacity);
+            if (res != 0)
                dst = (BYTE*)dst + res;
            dstCapacity -= res;
        }
@ -1486,11 +1489,13 @@ MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, const void* src,

 size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input)
 {
-    const char* const istart = (const char*)(input->src) + input->pos;
-    const char* const iend = (const char*)(input->src) + input->size;
+    const char* const src = (const char*)input->src;
+    const char* const istart = input->pos != 0 ? src + input->pos : src;
+    const char* const iend = input->size != 0 ? src + input->size : src;
    const char* ip = istart;
-    char* const ostart = (char*)(output->dst) + output->pos;
-    char* const oend = (char*)(output->dst) + output->size;
+    char* const dst = (char*)output->dst;
+    char* const ostart = output->pos != 0 ? dst + output->pos : dst;
+    char* const oend = output->size != 0 ? dst + output->size : dst;
    char* op = ostart;
    U32 someMoreWork = 1;

--- a/lib/decompress/zstd_decompress_block.c
+++ b/lib/decompress/zstd_decompress_block.c
@ -715,7 +715,7 @@ size_t ZSTD_execSequence(BYTE* op,

    /* Errors and uncommon cases handled here. */
    assert(oLitEnd < oMatchEnd);
-    if (iLitEnd > litLimit || oMatchEnd > oend_w)
+    if (UNLIKELY(iLitEnd > litLimit || oMatchEnd > oend_w))
        return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);

    /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
@ -729,7 +729,7 @@ size_t ZSTD_execSequence(BYTE* op,
     */
    assert(WILDCOPY_OVERLENGTH >= 16);
    ZSTD_copy16(op, (*litPtr));
-    if (sequence.litLength > 16) {
+    if (UNLIKELY(sequence.litLength > 16)) {
        ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap);
    }
    op = oLitEnd;
@ -738,7 +738,7 @@ size_t ZSTD_execSequence(BYTE* op,
    /* Copy Match */
    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
        /* offset beyond prefix -> go into extDict */
-        RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected);
+        RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected);
        match = dictEnd + (match - prefixStart);
        if (match + sequence.matchLength <= dictEnd) {
            memmove(oLitEnd, match, sequence.matchLength);
@ -760,7 +760,7 @@ size_t ZSTD_execSequence(BYTE* op,
    /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
     * without overlap checking.
     */
-    if (sequence.offset >= WILDCOPY_VECLEN) {
+    if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
        /* We bet on a full wildcopy for matches, since we expect matches to be
         * longer than literals (in general). In silesia, ~10% of matches are longer
         * than 16 bytes.
@ -802,6 +802,14 @@ ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
    DStatePtr->state = DInfo.nextState + lowBits;
 }

+FORCE_INLINE_TEMPLATE void
+ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo)
+{
+    U32 const nbBits = DInfo.nbBits;
+    size_t const lowBits = BIT_readBits(bitD, nbBits);
+    DStatePtr->state = DInfo.nextState + lowBits;
+}
+
 /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
 * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
 * bits before reloading. This value is the maximum number of bytes we read
@ -813,25 +821,26 @@ ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
        : 0)

 typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;
+typedef enum { ZSTD_p_noPrefetch=0, ZSTD_p_prefetch=1 } ZSTD_prefetch_e;

-#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
 FORCE_INLINE_TEMPLATE seq_t
-ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
+ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const ZSTD_prefetch_e prefetch)
 {
    seq_t seq;
-    U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits;
-    U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits;
-    U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits;
-    U32 const totalBits = llBits+mlBits+ofBits;
-    U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue;
-    U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue;
-    U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue;
+    ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state];
+    ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state];
+    ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state];
+    U32 const llBase = llDInfo.baseValue;
+    U32 const mlBase = mlDInfo.baseValue;
+    U32 const ofBase = ofDInfo.baseValue;
+    BYTE const llBits = llDInfo.nbAdditionalBits;
+    BYTE const mlBits = mlDInfo.nbAdditionalBits;
+    BYTE const ofBits = ofDInfo.nbAdditionalBits;
+    BYTE const totalBits = llBits+mlBits+ofBits;

    /* sequence */
    {   size_t offset;
-        if (!ofBits)
-            offset = 0;
-        else {
+        if (ofBits > 1) {
            ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
            ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
            assert(ofBits <= MaxOff);
@ -845,53 +854,89 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
                offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/);   /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
                if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
            }
+            seqState->prevOffset[2] = seqState->prevOffset[1];
+            seqState->prevOffset[1] = seqState->prevOffset[0];
+            seqState->prevOffset[0] = offset;
+        } else {
+            U32 const ll0 = (llBase == 0);
+            if (LIKELY((ofBits == 0))) {
+                if (LIKELY(!ll0))
+                    offset = seqState->prevOffset[0];
+                else {
+                    offset = seqState->prevOffset[1];
+                    seqState->prevOffset[1] = seqState->prevOffset[0];
+                    seqState->prevOffset[0] = offset;
                }
-
-        if (ofBits <= 1) {
-            offset += (llBase==0);
-            if (offset) {
-                size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
+            } else {
+                offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
+                {   size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
                    temp += !temp;   /* 0 is not valid; input is corrupted; force offset to 1 */
                    if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
                    seqState->prevOffset[1] = seqState->prevOffset[0];
                    seqState->prevOffset[0] = offset = temp;
-            } else {  /* offset == 0 */
-                offset = seqState->prevOffset[0];
-            }
-        } else {
-            seqState->prevOffset[2] = seqState->prevOffset[1];
-            seqState->prevOffset[1] = seqState->prevOffset[0];
-            seqState->prevOffset[0] = offset;
-        }
+        }   }   }
        seq.offset = offset;
    }

-    seq.matchLength = mlBase
-                    + ((mlBits>0) ? BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/) : 0);  /* <=  16 bits */
+    seq.matchLength = mlBase;
+    if (mlBits > 0)
+        seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
+
    if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
        BIT_reloadDStream(&seqState->DStream);
-    if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
+    if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
        BIT_reloadDStream(&seqState->DStream);
    /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
    ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);

-    seq.litLength = llBase
-                  + ((llBits>0) ? BIT_readBitsFast(&seqState->DStream, llBits/*>0*/) : 0);    /* <=  16 bits */
+    seq.litLength = llBase;
+    if (llBits > 0)
+        seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
+
    if (MEM_32bits())
        BIT_reloadDStream(&seqState->DStream);

    DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
                (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);

-    /* ANS state update */
+    if (prefetch == ZSTD_p_prefetch) {
+        size_t const pos = seqState->pos + seq.litLength;
+        const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart;
+        seq.match = matchBase + pos - seq.offset;  /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
+                                                    * No consequence though : no memory access will occur, offset is only used for prefetching */
+        seqState->pos = pos + seq.matchLength;
+    }
+
+    /* ANS state update
+     * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo().
+     * clang-9.2.0 does 7% worse with ZSTD_updateFseState().
+     * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the
+     * better option, so it is the default for other compilers. But, if you
+     * measure that it is worse, please put up a pull request.
+     */
+    {
+#if defined(__GNUC__) && !defined(__clang__)
+        const int kUseUpdateFseState = 1;
+#else
+        const int kUseUpdateFseState = 0;
+#endif
+        if (kUseUpdateFseState) {
            ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream);    /* <=  9 bits */
            ZSTD_updateFseState(&seqState->stateML, &seqState->DStream);    /* <=  9 bits */
            if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
            ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream);  /* <=  8 bits */
+        } else {
+            ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo);    /* <=  9 bits */
+            ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo);    /* <=  9 bits */
+            if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
+            ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo);  /* <=  8 bits */
+        }
+    }

    return seq;
 }

+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
 FORCE_INLINE_TEMPLATE size_t
 DONT_VECTORIZE
 ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
@ -914,6 +959,7 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
    /* Regen sequences */
    if (nbSeq) {
        seqState_t seqState;
+        size_t error = 0;
        dctx->fseEntropy = 1;
        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
        RETURN_ERROR_IF(
@ -928,17 +974,25 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
                BIT_DStream_endOfBuffer < BIT_DStream_completed &&
                BIT_DStream_completed < BIT_DStream_overflow);

-        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; ) {
-            nbSeq--;
-            {   seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+        for ( ; ; ) {
+            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_noPrefetch);
            size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
            DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
-                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
-                op += oneSeqSize;
-        }   }
+            BIT_reloadDStream(&(seqState.DStream));
+            /* gcc and clang both don't like early returns in this loop.
+             * gcc doesn't like early breaks either.
+             * Instead save an error and report it at the end.
+             * When there is an error, don't increment op, so we don't
+             * overwrite.
+             */
+            if (UNLIKELY(ZSTD_isError(oneSeqSize))) error = oneSeqSize;
+            else op += oneSeqSize;
+            if (UNLIKELY(!--nbSeq)) break;
+        }

        /* check if reached exact end */
        DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
+        if (ZSTD_isError(error)) return error;
        RETURN_ERROR_IF(nbSeq, corruption_detected);
        RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected);
        /* save reps for next block */
@ -965,87 +1019,7 @@ ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
 }
 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

-
-
 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
-FORCE_INLINE_TEMPLATE seq_t
-ZSTD_decodeSequenceLong(seqState_t* seqState, ZSTD_longOffset_e const longOffsets)
-{
-    seq_t seq;
-    U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits;
-    U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits;
-    U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits;
-    U32 const totalBits = llBits+mlBits+ofBits;
-    U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue;
-    U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue;
-    U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue;
-
-    /* sequence */
-    {   size_t offset;
-        if (!ofBits)
-            offset = 0;
-        else {
-            ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
-            ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
-            assert(ofBits <= MaxOff);
-            if (MEM_32bits() && longOffsets) {
-                U32 const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN_32-1);
-                offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
-                if (MEM_32bits() || extraBits) BIT_reloadDStream(&seqState->DStream);
-                if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
-            } else {
-                offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits);   /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
-                if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
-            }
-        }
-
-        if (ofBits <= 1) {
-            offset += (llBase==0);
-            if (offset) {
-                size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
-                temp += !temp;   /* 0 is not valid; input is corrupted; force offset to 1 */
-                if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
-                seqState->prevOffset[1] = seqState->prevOffset[0];
-                seqState->prevOffset[0] = offset = temp;
-            } else {
-                offset = seqState->prevOffset[0];
-            }
-        } else {
-            seqState->prevOffset[2] = seqState->prevOffset[1];
-            seqState->prevOffset[1] = seqState->prevOffset[0];
-            seqState->prevOffset[0] = offset;
-        }
-        seq.offset = offset;
-    }
-
-    seq.matchLength = mlBase + ((mlBits>0) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0);  /* <=  16 bits */
-    if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
-        BIT_reloadDStream(&seqState->DStream);
-    if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
-        BIT_reloadDStream(&seqState->DStream);
-    /* Verify that there is enough bits to read the rest of the data in 64-bit mode. */
-    ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
-
-    seq.litLength = llBase + ((llBits>0) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0);    /* <=  16 bits */
-    if (MEM_32bits())
-        BIT_reloadDStream(&seqState->DStream);
-
-    {   size_t const pos = seqState->pos + seq.litLength;
-        const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart;
-        seq.match = matchBase + pos - seq.offset;  /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
-                                                    * No consequence though : no memory access will occur, overly large offset will be detected in ZSTD_execSequenceLong() */
-        seqState->pos = pos + seq.matchLength;
-    }
-
-    /* ANS state update */
-    ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream);    /* <=  9 bits */
-    ZSTD_updateFseState(&seqState->stateML, &seqState->DStream);    /* <=  9 bits */
-    if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
-    ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream);  /* <=  8 bits */
-
-    return seq;
-}
-
 FORCE_INLINE_TEMPLATE size_t
 ZSTD_decompressSequencesLong_body(
                               ZSTD_DCtx* dctx,
@ -1088,14 +1062,14 @@ ZSTD_decompressSequencesLong_body(

        /* prepare in advance */
        for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
-            sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
+            sequences[seqNb] = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch);
            PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
        }
        RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected);

        /* decode and decompress */
        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
-            seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
+            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch);
            size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
            PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
--- a/lib/decompress/zstd_decompress_internal.h
+++ b/lib/decompress/zstd_decompress_internal.h
@ -160,7 +160,7 @@ struct ZSTD_DCtx_s

 /*! ZSTD_loadDEntropy() :
 *  dict : must point at beginning of a valid zstd dictionary.
- * @return : size of entropy tables read */
+ * @return : size of dictionary header (size of magic number + dict ID + entropy tables) */
 size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
                   const void* const dict, size_t const dictSize);

--- a/lib/dictBuilder/zdict.c
+++ b/lib/dictBuilder/zdict.c
@ -48,6 +48,7 @@
 #  define ZDICT_STATIC_LINKING_ONLY
 #endif
 #include "zdict.h"
+#include "compress/zstd_compress_internal.h" /* ZSTD_loadCEntropy() */


 /*-*************************************
@ -99,6 +100,29 @@ unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
    return MEM_readLE32((const char*)dictBuffer + 4);
 }

+size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
+{
+    size_t headerSize;
+    if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
+
+    {   unsigned offcodeMaxValue = MaxOff;
+        ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
+        U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
+        short* offcodeNCount = (short*)malloc((MaxOff+1)*sizeof(short));
+        if (!bs || !wksp || !offcodeNCount) {
+            headerSize = ERROR(memory_allocation);
+        } else {
+            ZSTD_reset_compressedBlockState(bs);
+            headerSize = ZSTD_loadCEntropy(bs, wksp, offcodeNCount, &offcodeMaxValue, dictBuffer, dictSize);
+        }
+
+        free(bs);
+        free(wksp);
+        free(offcodeNCount);
+    }
+
+    return headerSize;
+}

 /*-********************************************************
 *  Dictionary training functions
@ -588,12 +612,12 @@ typedef struct

 #define MAXREPOFFSET 1024

-static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
+static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params,
                              unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets,
                              const void* src, size_t srcSize,
                              U32 notificationLevel)
 {
-    size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
+    size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog);
    size_t cSize;

    if (srcSize > blockSizeMax) srcSize = blockSizeMax;   /* protection vs large samples */
@ -731,7 +755,7 @@ static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,

    /* collect stats on all samples */
    for (u=0; u<nbFiles; u++) {
-        ZDICT_countEStats(esr, params,
+        ZDICT_countEStats(esr, &params,
                          countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
                         (const char*)srcBuffer + pos, fileSizes[u],
                          notificationLevel);
--- a/lib/dictBuilder/zdict.h
+++ b/lib/dictBuilder/zdict.h
@ -64,6 +64,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCap

 /*======   Helper functions   ======*/
 ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize);  /**< extracts dictID; @return zero if error (not a valid dictionary) */
+ZDICTLIB_API size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize);  /* returns dict header size; returns a ZSTD error code on failure */
 ZDICTLIB_API unsigned ZDICT_isError(size_t errorCode);
 ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);

--- a/lib/zstd.h
+++ b/lib/zstd.h
@ -763,7 +763,7 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);

 /* This function is redundant with the advanced API and equivalent to:
 *
- *     ZSTD_DCtx_reset(zds);
+ *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
 *     ZSTD_DCtx_refDDict(zds, NULL);
 */
 ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
--- a/programs/.gitignore
+++ b/programs/.gitignore
@ -7,6 +7,7 @@ zstd-decompress
 zstd-frugal
 zstd-small
 zstd-nolegacy
+zstd-dictBuilder

 # Object files
 *.o
--- a/programs/Makefile
+++ b/programs/Makefile
@ -158,7 +158,7 @@ default: zstd-release
 all: zstd

 .PHONY: allVariants
-allVariants: zstd zstd-compress zstd-decompress zstd-small zstd-nolegacy
+allVariants: zstd zstd-compress zstd-decompress zstd-small zstd-nolegacy zstd-dictBuilder

 $(ZSTDDECOMP_O): CFLAGS += $(ALIGN_LOOP)

@ -231,6 +231,10 @@ zstd-decompress: $(ZSTDCOMMON_FILES) $(ZSTDDECOMP_FILES) zstdcli.c util.c timefn
 zstd-compress: $(ZSTDCOMMON_FILES) $(ZSTDCOMP_FILES) zstdcli.c util.c timefn.c fileio.c
 	$(CC) $(FLAGS) -DZSTD_NOBENCH -DZSTD_NODICT -DZSTD_NODECOMPRESS $^ -o $@$(EXT)

+zstd-dictBuilder: CPPFLAGS += -DZSTD_NOBENCH -DZSTD_NODECOMPRESS
+zstd-dictBuilder: $(ZSTDCOMMON_FILES) $(ZSTDCOMP_FILES) $(ZDICT_FILES) zstdcli.c util.c timefn.c fileio.c dibio.c
+	$(CC) $(FLAGS) $^ -o $@$(EXT)
+
 zstdmt: zstd
 	ln -sf zstd zstdmt

@ -245,7 +249,7 @@ clean:
 	@$(RM) core *.o tmp* result* *.gcda dictionary *.zst \
        zstd$(EXT) zstd32$(EXT) zstd-compress$(EXT) zstd-decompress$(EXT) \
        zstd-small$(EXT) zstd-frugal$(EXT) zstd-nolegacy$(EXT) zstd4$(EXT) \
-        *.gcda default.profraw have_zlib$(EXT)
+        zstd-dictBuilder$(EXT) *.gcda default.profraw have_zlib$(EXT)
 	@echo Cleaning completed

 MD2ROFF = ronn
--- a/programs/fileio.c
+++ b/programs/fileio.c
@ -502,7 +502,7 @@ static int FIO_remove(const char* path)
 #if defined(_WIN32) || defined(WIN32)
    /* windows doesn't allow remove read-only files,
     * so try to make it writable first */
-    chmod(path, _S_IWRITE);
+    UTIL_chmod(path, _S_IWRITE);
 #endif
    return remove(path);
 }
@ -526,9 +526,7 @@ static FILE* FIO_openSrcFile(const char* srcFileName)
    }

    if (!UTIL_isRegularFile(srcFileName)
-#ifndef _MSC_VER
     && !UTIL_isFIFO(srcFileName)
-#endif /* _MSC_VER */
    ) {
        DISPLAYLEVEL(1, "zstd: %s is not a regular file -- ignored \n",
                        srcFileName);
@ -609,8 +607,11 @@ FIO_openDstFile(FIO_prefs_t* const prefs,
    {   FILE* const f = fopen( dstFileName, "wb" );
        if (f == NULL) {
            DISPLAYLEVEL(1, "zstd: %s: %s\n", dstFileName, strerror(errno));
-        } else if(srcFileName != NULL && strcmp (srcFileName, stdinmark)) {
-            chmod(dstFileName, 00600);
+        } else if (srcFileName != NULL
+               && strcmp (srcFileName, stdinmark)
+               && strcmp(dstFileName, nulmark) ) {
+            /* reduce rights on newly created dst file while compression is ongoing */
+            UTIL_chmod(dstFileName, 00600);
        }
        return f;
    }
@ -1393,7 +1394,7 @@ static int FIO_compressFilename_dstFile(FIO_prefs_t* const prefs,
    assert(ress.srcFile != NULL);
    if (ress.dstFile == NULL) {
        closeDstFile = 1;
-        DISPLAYLEVEL(6, "FIO_compressFilename_dstFile: opening dst: %s", dstFileName);
+        DISPLAYLEVEL(6, "FIO_compressFilename_dstFile: opening dst: %s \n", dstFileName);
        ress.dstFile = FIO_openDstFile(prefs, srcFileName, dstFileName);
        if (ress.dstFile==NULL) return 1;  /* could not open dstFileName */
        /* Must only be added after FIO_openDstFile() succeeds.
@ -1415,6 +1416,7 @@ static int FIO_compressFilename_dstFile(FIO_prefs_t* const prefs,

        clearHandler();

+        DISPLAYLEVEL(6, "FIO_compressFilename_dstFile: closing dst: %s \n", dstFileName);
        if (fclose(dstFile)) { /* error closing dstFile */
            DISPLAYLEVEL(1, "zstd: %s: %s \n", dstFileName, strerror(errno));
            result=1;
@ -1427,7 +1429,10 @@ static int FIO_compressFilename_dstFile(FIO_prefs_t* const prefs,
        } else if ( strcmp(dstFileName, stdoutmark)
                 && strcmp(dstFileName, nulmark)
                 && transfer_permissions) {
+            DISPLAYLEVEL(6, "FIO_compressFilename_dstFile: transfering permissions into dst: %s \n", dstFileName);
            UTIL_setFileStat(dstFileName, &statbuf);
+        } else {
+            DISPLAYLEVEL(6, "FIO_compressFilename_dstFile: do not transfer permissions into dst: %s \n", dstFileName);
        }
    }

@ -1462,6 +1467,7 @@ FIO_compressFilename_srcFile(FIO_prefs_t* const prefs,
                             int compressionLevel)
 {
    int result;
+    DISPLAYLEVEL(6, "FIO_compressFilename_srcFile: %s \n", srcFileName);

    /* ensure src is not a directory */
    if (UTIL_isDirectory(srcFileName)) {
--- a/programs/util.c
+++ b/programs/util.c
@ -44,7 +44,6 @@ extern "C" {
        exit(1);              \
 }   }

-
 /*
 * A modified version of realloc().
 * If UTIL_realloc() fails the original block is freed.
@ -57,6 +56,10 @@ UTIL_STATIC void* UTIL_realloc(void *ptr, size_t size)
    return NULL;
 }

+#if defined(_MSC_VER)
+    #define chmod _chmod
+#endif
+

 /*-****************************************
 *  Console log
@ -64,9 +67,16 @@ UTIL_STATIC void* UTIL_realloc(void *ptr, size_t size)
 int g_utilDisplayLevel;


-/*-****************************************
-*  Public API
-******************************************/
+/*-*************************************
+*  Constants
+***************************************/
+#define LIST_SIZE_INCREASE   (8*1024)
+#define MAX_FILE_OF_FILE_NAMES_SIZE (1<<20)*50
+
+
+/*-*************************************
+*  Functions
+***************************************/

 int UTIL_fileExist(const char* filename)
 {
@ -98,6 +108,13 @@ int UTIL_getFileStat(const char* infilename, stat_t *statbuf)
    return 1;
 }

+/* like chmod, but avoid changing permission of /dev/null */
+int UTIL_chmod(char const* filename, mode_t permissions)
+{
+    if (!strcmp(filename, "/dev/null")) return 0;   /* pretend success, but don't change anything */
+    return chmod(filename, permissions);
+}
+
 int UTIL_setFileStat(const char *filename, stat_t *statbuf)
 {
    int res = 0;
@ -117,7 +134,7 @@ int UTIL_setFileStat(const char *filename, stat_t *statbuf)
    {
        /* (atime, mtime) */
        struct timespec timebuf[2] = { {0, UTIME_NOW} };
-        timebuf[1] = statbuf->st_mtim;
+        timebuf[1].tv_sec = statbuf->st_mtime;
        res += utimensat(AT_FDCWD, filename, timebuf, 0);
    }
 #endif
@ -126,21 +143,20 @@ int UTIL_setFileStat(const char *filename, stat_t *statbuf)
    res += chown(filename, statbuf->st_uid, statbuf->st_gid);  /* Copy ownership */
 #endif

-    res += chmod(filename, statbuf->st_mode & 07777);  /* Copy file permissions */
+    res += UTIL_chmod(filename, statbuf->st_mode & 07777);  /* Copy file permissions */

    errno = 0;
    return -res; /* number of errors is returned */
 }

-U32 UTIL_isDirectory(const char* infilename)
+int UTIL_isDirectory(const char* infilename)
 {
-    int r;
    stat_t statbuf;
 #if defined(_MSC_VER)
-    r = _stat64(infilename, &statbuf);
+    int const r = _stat64(infilename, &statbuf);
    if (!r && (statbuf.st_mode & _S_IFDIR)) return 1;
 #else
-    r = stat(infilename, &statbuf);
+    int const r = stat(infilename, &statbuf);
    if (!r && S_ISDIR(statbuf.st_mode)) return 1;
 #endif
    return 0;
@ -170,28 +186,25 @@ int UTIL_isSameFile(const char* fName1, const char* fName2)
 #endif
 }

-#ifndef _MSC_VER
-/* Using this to distinguish named pipes */
-U32 UTIL_isFIFO(const char* infilename)
+/* UTIL_isFIFO : distinguish named pipes */
+int UTIL_isFIFO(const char* infilename)
 {
 /* macro guards, as defined in : https://linux.die.net/man/2/lstat */
 #if PLATFORM_POSIX_VERSION >= 200112L
    stat_t statbuf;
-    int r = UTIL_getFileStat(infilename, &statbuf);
+    int const r = UTIL_getFileStat(infilename, &statbuf);
    if (!r && S_ISFIFO(statbuf.st_mode)) return 1;
 #endif
    (void)infilename;
    return 0;
 }
-#endif

-U32 UTIL_isLink(const char* infilename)
+int UTIL_isLink(const char* infilename)
 {
 /* macro guards, as defined in : https://linux.die.net/man/2/lstat */
 #if PLATFORM_POSIX_VERSION >= 200112L
-    int r;
    stat_t statbuf;
-    r = lstat(infilename, &statbuf);
+    int const r = lstat(infilename, &statbuf);
    if (!r && S_ISLNK(statbuf.st_mode)) return 1;
 #endif
    (void)infilename;
--- a/programs/util.h
+++ b/programs/util.h
@ -85,12 +85,6 @@ extern "C" {
 #endif


-/*-*************************************
-*  Constants
-***************************************/
-#define LIST_SIZE_INCREASE   (8*1024)
-#define MAX_FILE_OF_FILE_NAMES_SIZE (1<<20)*50
-
 /*-****************************************
 *  Compiler specifics
 ******************************************/
@ -120,8 +114,8 @@ extern int g_utilDisplayLevel;
 *  File functions
 ******************************************/
 #if defined(_MSC_VER)
-    #define chmod _chmod
    typedef struct __stat64 stat_t;
+    typedef int mode_t;
 #else
    typedef struct stat stat_t;
 #endif
@ -129,22 +123,20 @@ extern int g_utilDisplayLevel;

 int UTIL_fileExist(const char* filename);
 int UTIL_isRegularFile(const char* infilename);
-int UTIL_setFileStat(const char* filename, stat_t* statbuf);
-U32 UTIL_isDirectory(const char* infilename);
-int UTIL_getFileStat(const char* infilename, stat_t* statbuf);
+int UTIL_isDirectory(const char* infilename);
 int UTIL_isSameFile(const char* file1, const char* file2);
-int UTIL_compareStr(const void *p1, const void *p2);
 int UTIL_isCompressedFile(const char* infilename, const char *extensionList[]);
-const char* UTIL_getFileExtension(const char* infilename);
+int UTIL_isLink(const char* infilename);
+int UTIL_isFIFO(const char* infilename);

-#ifndef _MSC_VER
-U32 UTIL_isFIFO(const char* infilename);
-#endif
-U32 UTIL_isLink(const char* infilename);
 #define UTIL_FILESIZE_UNKNOWN  ((U64)(-1))
 U64 UTIL_getFileSize(const char* infilename);
-
-U64 UTIL_getTotalFileSize(const char* const * fileNamesTable, unsigned nbFiles);
+U64 UTIL_getTotalFileSize(const char* const * const fileNamesTable, unsigned nbFiles);
+int UTIL_getFileStat(const char* infilename, stat_t* statbuf);
+int UTIL_setFileStat(const char* filename, stat_t* statbuf);
+int UTIL_chmod(char const* filename, mode_t permissions);   /*< like chmod, but avoid changing permission of /dev/null */
+int UTIL_compareStr(const void *p1, const void *p2);
+const char* UTIL_getFileExtension(const char* infilename);


 /*-****************************************
--- a/programs/zstd.1.md
+++ b/programs/zstd.1.md
@ -405,7 +405,7 @@ The list of available _options_:
    Bigger hash tables cause less collisions which usually makes compression
    faster, but requires more memory during compression.

-    The minimum _hlog_ is 6 (64 B) and the maximum is 26 (128 MiB).
+    The minimum _hlog_ is 6 (64 B) and the maximum is 30 (1 GiB).

 - `chainLog`=_clog_, `clog`=_clog_:
    Specify the maximum number of bits for a hash chain or a binary tree.
@ -416,7 +416,8 @@ The list of available _options_:
    compression.
    This option is ignored for the ZSTD_fast strategy.

-    The minimum _clog_ is 6 (64 B) and the maximum is 28 (256 MiB).
+    The minimum _clog_ is 6 (64 B) and the maximum is 29 (524 Mib) on 32-bit platforms
+    and 30 (1 Gib) on 64-bit platforms.

 - `searchLog`=_slog_, `slog`=_slog_:
    Specify the maximum number of searches in a hash chain or a binary tree
@ -425,7 +426,7 @@ The list of available _options_:
    More searches increases the chance to find a match which usually increases
    compression ratio but decreases compression speed.

-    The minimum _slog_ is 1 and the maximum is 26.
+    The minimum _slog_ is 1 and the maximum is 'windowLog' - 1.

 - `minMatch`=_mml_, `mml`=_mml_:
    Specify the minimum searched length of a match in a hash table.
@ -450,7 +451,7 @@ The list of available _options_:

    For all other strategies, this field has no impact.

-    The minimum _tlen_ is 0 and the maximum is 999.
+    The minimum _tlen_ is 0 and the maximum is 128 Kib.

 - `overlapLog`=_ovlog_,  `ovlog`=_ovlog_:
    Determine `overlapSize`, amount of data reloaded from previous job.
@ -473,7 +474,7 @@ The list of available _options_:
    Bigger hash tables usually improve compression ratio at the expense of more
    memory during compression and a decrease in compression speed.

-    The minimum _lhlog_ is 6 and the maximum is 26 (default: 20).
+    The minimum _lhlog_ is 6 and the maximum is 30 (default: 20).

 - `ldmMinMatch`=_lmml_, `lmml`=_lmml_:
    Specify the minimum searched length of a match for long distance matching.
@ -493,7 +494,7 @@ The list of available _options_:
    Larger bucket sizes improve collision resolution but decrease compression
    speed.

-    The minimum _lblog_ is 0 and the maximum is 8 (default: 3).
+    The minimum _lblog_ is 1 and the maximum is 8 (default: 3).

 - `ldmHashRateLog`=_lhrlog_, `lhrlog`=_lhrlog_:
    Specify the frequency of inserting entries into the long distance matching
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@ -1006,9 +1006,7 @@ int main(int argCount, const char* argv[])
        unsigned const nbFilenames = (unsigned)filenames->tableSize;
        for (u=0, fileNamesNb=0; u<nbFilenames; u++) {
            if ( UTIL_isLink(filenames->fileNames[u])
-#ifndef _MSC_VER
             && !UTIL_isFIFO(filenames->fileNames[u])
-#endif /* _MSC_VER */
            ) {
                DISPLAYLEVEL(2, "Warning : %s is a symbolic link, ignoring \n", filenames->fileNames[u]);
            } else {
--- a/tests/fuzz/.gitignore
+++ b/tests/fuzz/.gitignore
@ -2,7 +2,13 @@
 corpora
 block_decompress
 block_round_trip
+dictionary_decompress
+dictionary_loader
+dictionary_round_trip
+simple_compress
 simple_decompress
 simple_round_trip
 stream_decompress
 stream_round_trip
+zstd_frame_info
+fuzz-*.log
--- a/tests/fuzz/dictionary_round_trip.c
+++ b/tests/fuzz/dictionary_round_trip.c
@ -73,7 +73,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *src, size_t size)

    size_t const rBufSize = size;
    void* rBuf = malloc(rBufSize);
-    size_t cBufSize = ZSTD_compressBound(size);
+    size_t cBufSize = ZSTD_compressBound(size) * 2;
    void *cBuf;
    /* Half of the time fuzz with a 1 byte smaller output size.
     * This will still succeed because we force the checksum to be disabled,
--- a/tests/fuzz/simple_round_trip.c
+++ b/tests/fuzz/simple_round_trip.c
@ -48,7 +48,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *src, size_t size)
 {
    size_t const rBufSize = size;
    void* rBuf = malloc(rBufSize);
-    size_t cBufSize = ZSTD_compressBound(size);
+    size_t cBufSize = ZSTD_compressBound(size) * 2;
    void* cBuf;

    /* Give a random portion of src data to the producer, to use for
--- a/tests/fuzz/zstd_helpers.c
+++ b/tests/fuzz/zstd_helpers.c
@ -96,6 +96,9 @@ void FUZZ_setRandomParameters(ZSTD_CCtx *cctx, size_t srcSize, FUZZ_dataProducer
    if (FUZZ_dataProducer_uint32Range(producer, 0, 1) == 0) {
      setRand(cctx, ZSTD_c_srcSizeHint, ZSTD_SRCSIZEHINT_MIN, 2 * srcSize, producer);
    }
+    if (FUZZ_dataProducer_uint32Range(producer, 0, 1) == 0) {
+      setRand(cctx, ZSTD_c_targetCBlockSize, ZSTD_TARGETCBLOCKSIZE_MIN, ZSTD_TARGETCBLOCKSIZE_MAX, producer);
+    }
 }

 FUZZ_dict_t FUZZ_train(void const* src, size_t srcSize, FUZZ_dataProducer_t *producer)
--- a/tests/fuzzer.c
+++ b/tests/fuzzer.c
@ -490,6 +490,19 @@ static int basicUnitTests(U32 const seed, double compressibility)
    }
    DISPLAYLEVEL(3, "OK \n");

+    DISPLAYLEVEL(3, "test%3i : compress a NULL input with each level : ", testNb++);
+    {   int level = -1;
+        ZSTD_CCtx* cctx = ZSTD_createCCtx();
+        if (!cctx) goto _output_error;
+        for (level = -1; level <= ZSTD_maxCLevel(); ++level) {
+          CHECK_Z( ZSTD_compress(compressedBuffer, compressedBufferSize, NULL, 0, level) );
+          CHECK_Z( ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, level) );
+          CHECK_Z( ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, NULL, 0) );
+        }
+        ZSTD_freeCCtx(cctx);
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
    DISPLAYLEVEL(3, "test%3d : check CCtx size after compressing empty input : ", testNb++);
    {   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
        size_t const r = ZSTD_compressCCtx(cctx, compressedBuffer, compressedBufferSize, NULL, 0, 19);
@ -884,6 +897,28 @@ static int basicUnitTests(U32 const seed, double compressibility)
        ZSTDMT_freeCCtx(mtctx);
    }

+    DISPLAYLEVEL(3, "test%3u : compress empty string and decompress with small window log : ", testNb++);
+    {   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+        ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+        char out[32];
+        if (cctx == NULL || dctx == NULL) goto _output_error;
+        CHECK( ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0) );
+        CHECK_VAR(cSize, ZSTD_compress2(cctx, out, sizeof(out), NULL, 0) );
+        DISPLAYLEVEL(3, "OK (%u bytes)\n", (unsigned)cSize);
+
+        CHECK( ZSTD_DCtx_setParameter(dctx, ZSTD_d_windowLogMax, 10) );
+        {   char const* outPtr = out;
+            ZSTD_inBuffer inBuffer = { outPtr, cSize, 0 };
+            ZSTD_outBuffer outBuffer = { NULL, 0, 0 };
+            size_t dSize;
+            CHECK_VAR(dSize, ZSTD_decompressStream(dctx, &outBuffer, &inBuffer) );
+            if (dSize != 0) goto _output_error;
+        }
+
+        ZSTD_freeDCtx(dctx);
+        ZSTD_freeCCtx(cctx);
+    }
+
    DISPLAYLEVEL(3, "test%3i : compress -T2 with/without literals compression : ", testNb++)
    {   ZSTD_CCtx* cctx = ZSTD_createCCtx();
        size_t cSize1, cSize2;
@ -1137,6 +1172,7 @@ static int basicUnitTests(U32 const seed, double compressibility)
        size_t* const samplesSizes = (size_t*) malloc(nbSamples * sizeof(size_t));
        size_t dictSize;
        U32 dictID;
+        size_t dictHeaderSize;

        if (dictBuffer==NULL || samplesSizes==NULL) {
            free(dictBuffer);
@ -1226,6 +1262,29 @@ static int basicUnitTests(U32 const seed, double compressibility)
        if (dictID==0) goto _output_error;
        DISPLAYLEVEL(3, "OK : %u \n", (unsigned)dictID);

+        DISPLAYLEVEL(3, "test%3i : check dict header size no error : ", testNb++);
+        dictHeaderSize = ZDICT_getDictHeaderSize(dictBuffer, dictSize);
+        if (dictHeaderSize==0) goto _output_error;
+        DISPLAYLEVEL(3, "OK : %u \n", (unsigned)dictHeaderSize);
+
+        DISPLAYLEVEL(3, "test%3i : check dict header size correctness : ", testNb++);
+        {   unsigned char const dictBufferFixed[144] = { 0x37, 0xa4, 0x30, 0xec, 0x63, 0x00, 0x00, 0x00, 0x08, 0x10, 0x00, 0x1f,
+                                                         0x0f, 0x00, 0x28, 0xe5, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                                                         0x00, 0x80, 0x0f, 0x9e, 0x0f, 0x00, 0x00, 0x24, 0x40, 0x80, 0x00, 0x01,
+                                                         0x02, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0xde, 0x08,
+                                                         0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+                                                         0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+                                                         0x08, 0x08, 0x08, 0x08, 0xbc, 0xe1, 0x4b, 0x92, 0x0e, 0xb4, 0x7b, 0x18,
+                                                         0x86, 0x61, 0x18, 0xc6, 0x18, 0x63, 0x8c, 0x31, 0xc6, 0x18, 0x63, 0x8c,
+                                                         0x31, 0x66, 0x66, 0x66, 0x66, 0xb6, 0x6d, 0x01, 0x00, 0x00, 0x00, 0x04,
+                                                         0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x20, 0x73, 0x6f, 0x64, 0x61,
+                                                         0x6c, 0x65, 0x73, 0x20, 0x74, 0x6f, 0x72, 0x74, 0x6f, 0x72, 0x20, 0x65,
+                                                         0x6c, 0x65, 0x69, 0x66, 0x65, 0x6e, 0x64, 0x2e, 0x20, 0x41, 0x6c, 0x69 };
+            dictHeaderSize = ZDICT_getDictHeaderSize(dictBufferFixed, 144);
+            if (dictHeaderSize != 115) goto _output_error;
+        }
+        DISPLAYLEVEL(3, "OK : %u \n", (unsigned)dictHeaderSize);
+
        DISPLAYLEVEL(3, "test%3i : compress with dictionary : ", testNb++);
        cSize = ZSTD_compress_usingDict(cctx, compressedBuffer, compressedBufferSize,
                                        CNBuffer, CNBuffSize,
@ -1999,14 +2058,14 @@ static int basicUnitTests(U32 const seed, double compressibility)

    /* long rle test */
    {   size_t sampleSize = 0;
+        size_t expectedCompressedSize = 39; /* block 1, 2: compressed, block 3: RLE, zstd 1.4.4 */
        DISPLAYLEVEL(3, "test%3i : Long RLE test : ", testNb++);
-        RDG_genBuffer(CNBuffer, sampleSize, compressibility, 0., seed+1);
        memset((char*)CNBuffer+sampleSize, 'B', 256 KB - 1);
        sampleSize += 256 KB - 1;
-        RDG_genBuffer((char*)CNBuffer+sampleSize, 96 KB, compressibility, 0., seed+2);
+        memset((char*)CNBuffer+sampleSize, 'A', 96 KB);
        sampleSize += 96 KB;
        cSize = ZSTD_compress(compressedBuffer, ZSTD_compressBound(sampleSize), CNBuffer, sampleSize, 1);
-        if (ZSTD_isError(cSize)) goto _output_error;
+        if (ZSTD_isError(cSize) || cSize > expectedCompressedSize) goto _output_error;
        { CHECK_NEWV(regenSize, ZSTD_decompress(decodedBuffer, sampleSize, compressedBuffer, cSize));
          if (regenSize!=sampleSize) goto _output_error; }
        DISPLAYLEVEL(3, "OK \n");
--- a/tests/playTests.sh
+++ b/tests/playTests.sh
@ -92,6 +92,11 @@ case "$UNAME" in
  *) MD5SUM="md5sum" ;;
 esac

+MTIME="stat -c %Y"
+case "$UNAME" in
+    Darwin | FreeBSD | OpenBSD) MTIME="stat -f %m" ;;
+esac
+
 DIFF="diff"
 case "$UNAME" in
  SunOS) DIFF="gdiff" ;;
@ -215,20 +220,19 @@ $ZSTD tmp -c --compress-literals    -19      | $ZSTD -t
 $ZSTD -b --fast=1 -i0e1 tmp --compress-literals
 $ZSTD -b --fast=1 -i0e1 tmp --no-compress-literals

-println "test: --exclude-compressed flag"
+println "\n===>  --exclude-compressed flag"
 rm -rf precompressedFilterTestDir
 mkdir -p precompressedFilterTestDir
 ./datagen $size > precompressedFilterTestDir/input.5
 ./datagen $size > precompressedFilterTestDir/input.6
 $ZSTD --exclude-compressed --long --rm -r precompressedFilterTestDir
-sleep 5
 ./datagen $size > precompressedFilterTestDir/input.7
 ./datagen $size > precompressedFilterTestDir/input.8
 $ZSTD --exclude-compressed --long --rm -r precompressedFilterTestDir
 test ! -f precompressedFilterTestDir/input.5.zst.zst
 test ! -f precompressedFilterTestDir/input.6.zst.zst
-file1timestamp=`date -r precompressedFilterTestDir/input.5.zst +%s`
-file2timestamp=`date -r precompressedFilterTestDir/input.7.zst +%s`
+file1timestamp=`$MTIME precompressedFilterTestDir/input.5.zst`
+file2timestamp=`$MTIME precompressedFilterTestDir/input.7.zst`
 if [[ $file2timestamp -ge $file1timestamp ]]; then
  println "Test is successful. input.5.zst is precompressed and therefore not compressed/modified again."
 else
@ -246,7 +250,7 @@ test -f precompressedFilterTestDir/input.5.zst.zst
 test -f precompressedFilterTestDir/input.6.zst.zst
 println "Test completed"

-println "test : file removal"
+println "\n===>  file removal"
 $ZSTD -f --rm tmp
 test ! -f tmp  # tmp should no longer be present
 $ZSTD -f -d --rm tmp.zst
@ -275,13 +279,16 @@ rm -rf tmp*  # may also erase tmp* directory from previous failed run


 println "\n===>  decompression only tests "
-head -c 1048576 /dev/zero > tmp
+# the following test verifies that the decoder is compatible with RLE as first block
+# older versions of zstd cli are not able to decode such corner case.
+# As a consequence, the zstd cli do not generate them, to maintain compatibility with older versions.
+dd bs=1048576 count=1 if=/dev/zero of=tmp
 $ZSTD -d -o tmp1 "$TESTDIR/golden-decompression/rle-first-block.zst"
 $DIFF -s tmp1 tmp
 rm tmp*


-println "test : compress multiple files"
+println "\n===>  compress multiple files"
 println hello > tmp1
 println world > tmp2
 $ZSTD tmp1 tmp2 -o "$INTOVOID" -f
@ -328,7 +335,22 @@ $ZSTD -f tmp1 notHere tmp2 && die "missing file not detected!"
 rm tmp*


-println "test : compress multiple files into an output directory, --output-dir-flat"
+if [ -n "$DEVNULLRIGHTS" ]
+then
+    # these tests requires sudo rights, which is uncommon.
+    # they are only triggered if DEVNULLRIGHTS macro is defined.
+    println "\n===> checking /dev/null permissions are unaltered "
+    ./datagen > tmp
+    sudo $ZSTD tmp -o $INTOVOID   # sudo rights could modify /dev/null permissions
+    sudo $ZSTD tmp -c > $INTOVOID
+    $ZSTD tmp -f -o tmp.zst
+    sudo $ZSTD -d tmp.zst -c > $INTOVOID
+    sudo $ZSTD -d tmp.zst -o $INTOVOID
+    ls -las $INTOVOID | grep "rw-rw-rw-"
+fi
+
+
+println "\n===>  compress multiple files into an output directory, --output-dir-flat"
 println henlo > tmp1
 mkdir tmpInputTestDir
 mkdir tmpInputTestDir/we
@ -436,7 +458,6 @@ $ZSTD -dcf tmp1


 println "\n===>  frame concatenation "
-
 println "hello " > hello.tmp
 println "world!" > world.tmp
 cat hello.tmp world.tmp > helloworld.tmp
@ -1241,9 +1262,9 @@ rm -f tmp* dictionary
 if [ "$isWindows" = false ] ; then

 println "\n===>  zstd fifo named pipe test "
-head -c 10 /dev/zero > tmp_original
+dd bs=1 count=10 if=/dev/zero of=tmp_original
 mkfifo named_pipe
-head -c 10 /dev/zero > named_pipe &
+dd bs=1 count=10 if=/dev/zero of=named_pipe &
 $ZSTD named_pipe -o tmp_compressed
 $ZSTD -d -o tmp_decompressed tmp_compressed
 $DIFF -s tmp_original tmp_decompressed
--- a/tests/zstreamtest.c
+++ b/tests/zstreamtest.c
@ -509,6 +509,33 @@ static int basicUnitTests(U32 seed, double compressibility)
    }
    DISPLAYLEVEL(3, "OK \n");

+    DISPLAYLEVEL(3, "test%3i : NULL buffers : ", testNb++);
+    inBuff.src = NULL;
+    inBuff.size = 0;
+    inBuff.pos = 0;
+    outBuff.dst = NULL;
+    outBuff.size = 0;
+    outBuff.pos = 0;
+    CHECK_Z( ZSTD_compressStream(zc, &outBuff, &inBuff) );
+    CHECK(inBuff.pos != inBuff.size, "Entire input should be consumed");
+    CHECK_Z( ZSTD_endStream(zc, &outBuff) );
+    outBuff.dst = (char*)(compressedBuffer);
+    outBuff.size = compressedBufferSize;
+    outBuff.pos = 0;
+    {   size_t const r = ZSTD_endStream(zc, &outBuff);
+        CHECK(r != 0, "Error or some data not flushed (ret=%zu)", r);
+    }
+    inBuff.src = outBuff.dst;
+    inBuff.size = outBuff.pos;
+    inBuff.pos = 0;
+    outBuff.dst = NULL;
+    outBuff.size = 0;
+    outBuff.pos = 0;
+    CHECK_Z( ZSTD_initDStream(zd) );
+    {   size_t const ret = ZSTD_decompressStream(zd, &outBuff, &inBuff);
+        if (ret != 0) goto _output_error;
+    }
+    DISPLAYLEVEL(3, "OK\n");
    /* _srcSize compression test */
    DISPLAYLEVEL(3, "test%3i : compress_srcSize %u bytes : ", testNb++, COMPRESSIBLE_NOISE_LENGTH);
    CHECK_Z( ZSTD_initCStream_srcSize(zc, 1, CNBufferSize) );
--- a/zlibWrapper/Makefile
+++ b/zlibWrapper/Makefile
@ -20,8 +20,8 @@ TEST_FILE = ../doc/zstd_compression_format.md

 CPPFLAGS += -DXXH_NAMESPACE=ZSTD_ -I$(ZLIB_PATH) -I$(PROGRAMS_PATH)       \
            -I$(ZSTDLIBDIR) -I$(ZSTDLIBDIR)/common -I$(ZLIBWRAPPER_PATH)
-STDFLAGS  = -std=c90 -pedantic -Wno-long-long -Wno-variadic-macros -Wc++-compat \
-            -DNO_snprintf -DNO_vsnprintf  # strict ISO C90 is missing these prototypes
+STDFLAGS  = -std=c89 -pedantic -Wno-long-long -Wno-variadic-macros -Wc++-compat \
+            -DNO_snprintf -DNO_vsnprintf  # strict ANSI C89 is missing these prototypes
 DEBUGFLAGS= -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow -Wswitch-enum \
            -Wdeclaration-after-statement -Wstrict-prototypes -Wundef     \
            -Wstrict-aliasing=1