diff --git a/Makefile b/Makefile
index 8049649d..c778cb72 100644
--- a/Makefile
+++ b/Makefile
@@ -32,7 +32,7 @@
 # ################################################################
 
 # Version number
-export VERSION=0.0.2
+export VERSION=0.1.0
 export RELEASE=r$(VERSION)
 
 DESTDIR?=
@@ -93,6 +93,7 @@ prg-travis:
 	@cd $(PRGDIR); $(MAKE) -e $(ZSTD_TRAVIS_CI_ENV)
 
 clangtest: clean
+	clang -v
 	$(MAKE) all CC=clang MOREFLAGS="-Werror -Wconversion -Wno-sign-conversion"
 
 gpptest: clean
diff --git a/README.md b/README.md
old mode 100755
new mode 100644
index d53bb38f..ccb96595
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
- **Zstd**, short for Zstandard, is a new lossless compression algorithm, which provides both good compression ratio _and_ speed for your standard compression needs. "Standard" translates into everyday situations which neither look for highest possible ratio (which LZMA and ZPAQ cover) nor extreme speeds (which LZ4 covers).
+ **Zstd**, short for Zstandard, is a new lossless compression algorithm, which provides both good compression ratio _and_ speed for your standard compression needs. "Standard" translates into everyday situations which neither look for highest possible ratio nor extreme speed.
 
 It is provided as a BSD-license package, hosted on Github.
 
@@ -7,40 +7,42 @@ It is provided as a BSD-license package, hosted on Github.
 |master      | [![Build Status](https://travis-ci.org/Cyan4973/zstd.svg?branch=master)](https://travis-ci.org/Cyan4973/zstd) |
 |dev         | [![Build Status](https://travis-ci.org/Cyan4973/zstd.svg?branch=dev)](https://travis-ci.org/Cyan4973/zstd) |
 
-For a taste of its performance, here are a few benchmark numbers, completed on a Core i5-4300U @ 1.9 GHz, using [fsbench 0.14.3](http://encode.ru/threads/1371-Filesystem-benchmark?p=34029&viewfull=1#post34029), an open-source benchmark program by m^2.
+For a taste of its performance, here are a few benchmark numbers, completed on a Core i7-5600U @ 2.6 GHz, using [fsbench 0.14.3](http://encode.ru/threads/1371-Filesystem-benchmark?p=34029&viewfull=1#post34029), an open-source benchmark program by m^2.
 
-|Name           | Ratio | C.speed | D.speed |
-|---------------|-------|---------|---------|
-|               |       |   MB/s  |  MB/s   |
-| [zlib 1.2.8 -6](http://www.zlib.net/)| 3.099 |    18   |  275    |
-| **zstd**      |**2.872**|**201**|**498**  |
-| [zlib 1.2.8 -1](http://www.zlib.net/)| 2.730 |    58   |   250   |
-| [LZ4 HC r127](https://github.com/Cyan4973/lz4)| 2.720 |   26    |  1720   |
-| QuickLZ 1.5.1b6|2.237 |  323    |  373    |
-| LZO 2.06      | 2.106 |  351    |  510    |
-| Snappy 1.1.0  | 2.091 |  238    |  964    |
-| [LZ4 r127](https://github.com/Cyan4973/lz4)| 2.084 |  370    | 1590    |
-| LZF 3.6       | 2.077 |  220    |  502    |
+|Name            | Ratio | C.speed | D.speed |
+|----------------|-------|--------:|--------:|
+|                |       |   MB/s  |  MB/s   |
+| [zlib 1.2.8] -6| 3.099 |    21   |   320   |
+| **zstd**       |**2.871**|**255**| **628** |
+| [zlib 1.2.8] -1| 2.730 |    70   |   300   | 
+| [LZ4] HC r131  | 2.720 |    25   |  2100   |
+| QuickLZ 1.5.1b6| 2.237 |   370   |   415   |
+| LZO 2.06       | 2.106 |   400   |   580   |
+| Snappy 1.1.0   | 2.091 |   330   |  1100   |
+| [LZ4] r131     | 2.101 |   450   |  2100   |
+| LZF 3.6        | 2.077 |   200   |   560   |
+
+[zlib 1.2.8]:http://www.zlib.net/
+[LZ4]:http://www.lz4.org/
 
 An interesting feature of zstd is that it can qualify as both a reasonably strong compressor and a fast one.
 
-Zstd delivers high decompression speed, at around ~500 MB/s per core.
+Zstd delivers high decompression speed, at more than 600 MB/s per core.
 Obviously, your exact mileage will vary depending on your target system.
 
-Zstd compression speed, on the other hand, can be configured to fit different situations.
-The first, fast, derivative offers ~200 MB/s per core, which is suitable for a few real-time scenarios.
-But similar to [LZ4](https://github.com/Cyan4973/lz4), zstd can offer derivatives trading compression time for compression ratio, while keeping decompression properties intact. "Offline compression", where compression time is of little importance because the content is only compressed once and decompressed many times, is therefore within the scope.
+Zstd compression speed will be configurable to fit different situations.
+The first version offered is the fast one, at ~250 MB/s per core, which is suitable for a few real-time scenarios.
+But similar to [LZ4], zstd can offer derivatives trading compression time for compression ratio, keeping decompression properties intact. "Offline compression", where compression time is of little importance because the content is only compressed once and decompressed many times, is therefore within scope.
 
 Note that high compression derivatives still have to be developed.
-It's a complex area which will certainly benefit the contributions from a few experts.
+It's a complex area which will require time and benefit from contributions.
 
 
 Another property zstd is developed for is configurable memory requirement, with the objective to fit into low-memory configurations, or servers handling many connections in parallel.
 
-Zstd entropy stage is provided by [FSE (Finite State Entropy)](https://github.com/Cyan4973/FiniteStateEntropy).
+Zstd entropy stage is provided by [Huff0 and FSE, from the Finite State Entropy library](https://github.com/Cyan4973/FiniteStateEntropy).
 
-Zstd development is starting. So consider current results merely as early ones. The implementation will gradually evolve and improve overtime, especially during this first year. This is a phase which will depend a lot on user feedback, since these feedback will be key in deciding next priorities or features to add.
+Zstd is still considered experimental at this stage. Specifically, it doesn't yet guarantee that its current stream/file format will remain supported in future versions of the library. Therefore, only use Zstd in environments where you can control the availability of the decompression library. "Stable" status, including an officially documented format and long-term support commitment, is projected for sometime in early 2016.
 
-The "master" branch is reserved for stable release and betas.
-The "dev" branch is the one where all contributions will be merged. If you plan to propose a patch, please commit into the "dev" branch. Direct commit to "master" are not permitted.
-Feature branches will also exist, typically to introduce new requirements, and be temporarily available for testing before merge into "dev" branch.
+### Branch Policy
+The "dev" branch is the one where all contributions will be merged before reaching "master". If you plan to propose a patch, please commit into the "dev" branch or its own feature branch. Direct commits to "master" are not permitted.
diff --git a/lib/fse.c b/lib/fse.c
index b0318f15..2c55a563 100644
--- a/lib/fse.c
+++ b/lib/fse.c
@@ -52,11 +52,22 @@
 
 
 /****************************************************************
-*  Generic function type & suffix (C template emulation)
+*  template functions type & suffix
 ****************************************************************/
 #define FSE_FUNCTION_TYPE BYTE
 #define FSE_FUNCTION_EXTENSION
 
+
+/****************************************************************
+*  Byte symbol type
+****************************************************************/
+typedef struct   /* one decoding-table cell for byte symbols; field roles below are inferred from their names - TODO confirm against the decoder */
+{
+    unsigned short newState;   /* presumably the state value the decoder moves to next - verify */
+    unsigned char  symbol;     /* presumably the byte emitted for this cell - verify */
+    unsigned char  nbBits;     /* presumably the number of bits to read from the stream - verify */
+} FSE_decode_t;   /* size == U32 (2 + 1 + 1 bytes, given a 16-bit short) */
+
 #endif   /* !FSE_COMMONDEFS_ONLY */
 
 
@@ -116,17 +127,121 @@ typedef   signed long long  S64;
 /****************************************************************
 *  Memory I/O
 *****************************************************************/
+/* FSE_FORCE_MEMORY_ACCESS
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The switch below selects a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on a compiler extension (i.e., not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violates the C standard.
+ *            It can generate buggy code on targets which generate assembly depending on alignment.
+ *            But in some circumstances, it's the only known way to get the most performance (e.g. GCC + ARMv6)
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef FSE_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define FSE_FORCE_MEMORY_ACCESS 2
+#  elif defined(__INTEL_COMPILER) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define FSE_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+
+static unsigned FSE_32bits(void)   /* returns 1 when a data pointer is 4 bytes wide, 0 otherwise */
+{
+    return sizeof(void*)==4;   /* NOTE(review): FSE_readLEST/FSE_writeLEST use this to pick the 32- or 64-bit path for a size_t; it tests pointer width, not sizeof(size_t) - confirm both agree on every target */
+}
+
 static unsigned FSE_isLittleEndian(void)
 {
     const union { U32 i; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
     return one.c[0];
 }
 
+#if defined(FSE_FORCE_MEMORY_ACCESS) && (FSE_FORCE_MEMORY_ACCESS==2)
+
+static U16 FSE_read16(const void* memPtr) { return *(const U16*) memPtr; }   /* method 2 : plain typed dereference, native byte order */
+static U32 FSE_read32(const void* memPtr) { return *(const U32*) memPtr; }   /* same, 32-bit */
+static U64 FSE_read64(const void* memPtr) { return *(const U64*) memPtr; }   /* same, 64-bit */
+
+static void FSE_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }   /* method 2 : plain typed store, native byte order */
+static void FSE_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; }   /* same, 32-bit */
+static void FSE_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; }   /* same, 64-bit */
+
+#elif defined(FSE_FORCE_MEMORY_ACCESS) && (FSE_FORCE_MEMORY_ACCESS==1)
+
+/* Method 1 : access through a packed union (`__attribute__((packed))`). Safer than direct access, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign;   /* one overlay type shared by all 16/32/64-bit accessors below */
+
+static U16 FSE_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }   /* loads the 16-bit member at ptr, native byte order */
+static U32 FSE_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }   /* same, 32-bit member */
+static U64 FSE_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }   /* same, 64-bit member */
+
+static void FSE_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }   /* stores value into the 16-bit member at memPtr, native byte order */
+static void FSE_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; }   /* same, 32-bit member */
+static void FSE_write64(void* memPtr, U64 value) { ((unalign*)memPtr)->u64 = value; }   /* same, 64-bit member */
+
+#else
+
+static U16 FSE_read16(const void* memPtr)   /* method 0 (default) : reads a U16 from memPtr through memcpy(), native byte order */
+{
+    U16 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
 static U32 FSE_read32(const void* memPtr)
 {
-    U32 val32;
-    memcpy(&val32, memPtr, 4);
-    return val32;
+    U32 val; memcpy(&val, memPtr, sizeof(val)); return val;   /* same behaviour as before; sizeof(val) replaces the literal 4 */
+}
+
+static U64 FSE_read64(const void* memPtr)   /* method 0 : reads a U64 from memPtr through memcpy(), native byte order */
+{
+    U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+static void FSE_write16(void* memPtr, U16 value)   /* method 0 : writes value to memPtr through memcpy(), native byte order */
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+static void FSE_write32(void* memPtr, U32 value)   /* method 0 : 32-bit counterpart of FSE_write16 */
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+static void FSE_write64(void* memPtr, U64 value)   /* method 0 : 64-bit counterpart of FSE_write16 */
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+#endif /* FSE_FORCE_MEMORY_ACCESS */
+
+static U16 FSE_readLE16(const void* memPtr)   /* reads a 16-bit value stored little-endian at memPtr, whatever the host byte order */
+{
+    if (FSE_isLittleEndian())
+        return FSE_read16(memPtr);   /* host order already matches : native read */
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U16)(p[0] + (p[1]<<8));   /* p[0] is the low byte, p[1] the high byte */
+    }
+}
+
+static void FSE_writeLE16(void* memPtr, U16 val)   /* stores val at memPtr in little-endian byte order, whatever the host byte order */
+{
+    if (FSE_isLittleEndian())
+    {
+        FSE_write16(memPtr, val);   /* host order already matches : native write */
+    }
+    else
+    {
+        BYTE* p = (BYTE*)memPtr;
+        p[0] = (BYTE)val;        /* low byte first */
+        p[1] = (BYTE)(val>>8);   /* then high byte */
+    }
 }
 
 static U32 FSE_readLE32(const void* memPtr)
@@ -144,7 +259,7 @@ static void FSE_writeLE32(void* memPtr, U32 val32)
 {
     if (FSE_isLittleEndian())
     {
-        memcpy(memPtr, &val32, 4);
+        FSE_write32(memPtr, val32);
     }
     else
     {
@@ -156,13 +271,6 @@ static void FSE_writeLE32(void* memPtr, U32 val32)
     }
 }
 
-static U64 FSE_read64(const void* memPtr)
-{
-    U64 val64;
-    memcpy(&val64, memPtr, 8);
-    return val64;
-}
-
 static U64 FSE_readLE64(const void* memPtr)
 {
     if (FSE_isLittleEndian())
@@ -179,7 +287,7 @@ static void FSE_writeLE64(void* memPtr, U64 val64)
 {
     if (FSE_isLittleEndian())
     {
-        memcpy(memPtr, &val64, 8);
+        FSE_write64(memPtr, val64);
     }
     else
     {
@@ -197,7 +305,7 @@ static void FSE_writeLE64(void* memPtr, U64 val64)
 
 static size_t FSE_readLEST(const void* memPtr)
 {
-    if (sizeof(size_t)==4)
+    if (FSE_32bits())
         return (size_t)FSE_readLE32(memPtr);
     else
         return (size_t)FSE_readLE64(memPtr);
@@ -205,7 +313,7 @@ static size_t FSE_readLEST(const void* memPtr)
 
 static void FSE_writeLEST(void* memPtr, size_t val)
 {
-    if (sizeof(size_t)==4)
+    if (FSE_32bits())
         FSE_writeLE32(memPtr, (U32)val);
     else
         FSE_writeLE64(memPtr, (U64)val);
@@ -238,11 +346,9 @@ static void FSE_writeLEST(void* memPtr, size_t val)
 ****************************************************************/
 typedef struct
 {
-    int  deltaFindState;
-    U16  maxState;
-    BYTE minBitsOut;
-    /* one byte padding ; total 8 bytes */
-} FSE_symbolCompressionTransform;
+    int deltaFindState;
+    U32 deltaNbBits;
+} FSE_symbolCompressionTransform; /* total 8 bytes */
 
 typedef U32 CTable_max_t[FSE_CTABLE_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)];
 typedef U32 DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
@@ -273,1060 +379,10 @@ FORCE_INLINE unsigned FSE_highbit32 (register U32 val)
 }
 
 
-#ifndef FSE_COMMONDEFS_ONLY
-
-unsigned FSE_isError(size_t code) { return (code > (size_t)(-FSE_ERROR_maxCode)); }
-
-#define FSE_GENERATE_STRING(STRING) #STRING,
-static const char* FSE_errorStrings[] = { FSE_LIST_ERRORS(FSE_GENERATE_STRING) };
-
-const char* FSE_getErrorName(size_t code)
-{
-    static const char* codeError = "Unspecified error code";
-    if (FSE_isError(code)) return FSE_errorStrings[-(int)(code)];
-    return codeError;
-}
-
-static short FSE_abs(short a)
-{
-    return a<0? -a : a;
-}
-
-
 /****************************************************************
-*  Header bitstream management
-****************************************************************/
-size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
-{
-    size_t maxHeaderSize = (((maxSymbolValue+1) * tableLog) >> 3) + 1;
-    return maxSymbolValue ? maxHeaderSize : FSE_MAX_HEADERSIZE;
-}
-
-#ifndef __clang_analyzer__   /* clang static analyzer has difficulties with this function : seems to believe normalizedCounter is uninitialized */
-
-static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
-                                       const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
-                                       unsigned safeWrite)
-{
-    BYTE* const ostart = (BYTE*) header;
-    BYTE* out = ostart;
-    BYTE* const oend = ostart + headerBufferSize;
-    int nbBits;
-    const int tableSize = 1 << tableLog;
-    int remaining;
-    int threshold;
-    U32 bitStream;
-    int bitCount;
-    unsigned charnum = 0;
-    int previous0 = 0;
-
-    bitStream = 0;
-    bitCount  = 0;
-    /* Table Size */
-    bitStream += (tableLog-FSE_MIN_TABLELOG) << bitCount;
-    bitCount  += 4;
-
-    /* Init */
-    remaining = tableSize+1;   /* +1 for extra accuracy */
-    threshold = tableSize;
-    nbBits = tableLog+1;
-
-    while (remaining>1)   /* stops at 1 */
-    {
-        if (previous0)
-        {
-            unsigned start = charnum;
-            while (!normalizedCounter[charnum]) charnum++;
-            while (charnum >= start+24)
-            {
-                start+=24;
-                bitStream += 0xFFFFU << bitCount;
-                if ((!safeWrite) && (out > oend-2)) return (size_t)-FSE_ERROR_GENERIC;   /* Buffer overflow */
-                out[0] = (BYTE) bitStream;
-                out[1] = (BYTE)(bitStream>>8);
-                out+=2;
-                bitStream>>=16;
-            }
-            while (charnum >= start+3)
-            {
-                start+=3;
-                bitStream += 3 << bitCount;
-                bitCount += 2;
-            }
-            bitStream += (charnum-start) << bitCount;
-            bitCount += 2;
-            if (bitCount>16)
-            {
-                if ((!safeWrite) && (out > oend - 2)) return (size_t)-FSE_ERROR_GENERIC;   /* Buffer overflow */
-                out[0] = (BYTE)bitStream;
-                out[1] = (BYTE)(bitStream>>8);
-                out += 2;
-                bitStream >>= 16;
-                bitCount -= 16;
-            }
-        }
-        {
-            short count = normalizedCounter[charnum++];
-            const short max = (short)((2*threshold-1)-remaining);
-            remaining -= FSE_abs(count);
-            if (remaining<1) return (size_t)-FSE_ERROR_GENERIC;
-            count++;   /* +1 for extra accuracy */
-            if (count>=threshold) count += max;   /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
-            bitStream += count << bitCount;
-            bitCount  += nbBits;
-            bitCount  -= (count<max);
-            previous0 = (count==1);
-            while (remaining<threshold) nbBits--, threshold>>=1;
-        }
-        if (bitCount>16)
-        {
-            if ((!safeWrite) && (out > oend - 2)) return (size_t)-FSE_ERROR_GENERIC;   /* Buffer overflow */
-            out[0] = (BYTE)bitStream;
-            out[1] = (BYTE)(bitStream>>8);
-            out += 2;
-            bitStream >>= 16;
-            bitCount -= 16;
-        }
-    }
-
-    /* flush remaining bitStream */
-    if ((!safeWrite) && (out > oend - 2)) return (size_t)-FSE_ERROR_GENERIC;   /* Buffer overflow */
-    out[0] = (BYTE)bitStream;
-    out[1] = (BYTE)(bitStream>>8);
-    out+= (bitCount+7) /8;
-
-    if (charnum > maxSymbolValue + 1) return (size_t)-FSE_ERROR_GENERIC;   /* Too many symbols written (a bit too late?) */
-
-    return (out-ostart);
-}
-#endif // __clang_analyzer__
-
-
-size_t FSE_writeNCount (void* header, size_t headerBufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
-{
-    if (tableLog > FSE_MAX_TABLELOG) return (size_t)-FSE_ERROR_GENERIC;   /* Unsupported */
-    if (tableLog < FSE_MIN_TABLELOG) return (size_t)-FSE_ERROR_GENERIC;   /* Unsupported */
-
-    if (headerBufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog))
-        return FSE_writeNCount_generic(header, headerBufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);
-
-    return FSE_writeNCount_generic(header, headerBufferSize, normalizedCounter, maxSymbolValue, tableLog, 1);
-}
-
-
-size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
-                 const void* headerBuffer, size_t hbSize)
-{
-    const BYTE* const istart = (const BYTE*) headerBuffer;
-    const BYTE* const iend = istart + hbSize;
-    const BYTE* ip = istart;
-    int nbBits;
-    int remaining;
-    int threshold;
-    U32 bitStream;
-    int bitCount;
-    unsigned charnum = 0;
-    int previous0 = 0;
-
-    if (hbSize < 4) return (size_t)-FSE_ERROR_srcSize_wrong;
-    bitStream = FSE_readLE32(ip);
-    nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
-    if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return (size_t)-FSE_ERROR_tableLog_tooLarge;
-    bitStream >>= 4;
-    bitCount = 4;
-    *tableLogPtr = nbBits;
-    remaining = (1<<nbBits)+1;
-    threshold = 1<<nbBits;
-    nbBits++;
-
-    while ((remaining>1) && (charnum<=*maxSVPtr))
-    {
-        if (previous0)
-        {
-            unsigned n0 = charnum;
-            while ((bitStream & 0xFFFF) == 0xFFFF)
-            {
-                n0+=24;
-                ip+=2;
-                bitStream = FSE_readLE32(ip) >> bitCount;
-            }
-            while ((bitStream & 3) == 3)
-            {
-                n0+=3;
-                bitStream>>=2;
-                bitCount+=2;
-            }
-            n0 += bitStream & 3;
-            bitCount += 2;
-            if (n0 > *maxSVPtr) return (size_t)-FSE_ERROR_maxSymbolValue_tooSmall;
-            while (charnum < n0) normalizedCounter[charnum++] = 0;
-            ip += bitCount>>3;
-            bitCount &= 7;
-            bitStream = FSE_readLE32(ip) >> bitCount;
-        }
-        {
-            const short max = (short)((2*threshold-1)-remaining);
-            short count;
-
-            if ((bitStream & (threshold-1)) < (U32)max)
-            {
-                count = (short)(bitStream & (threshold-1));
-                bitCount   += nbBits-1;
-            }
-            else
-            {
-                count = (short)(bitStream & (2*threshold-1));
-                if (count >= threshold) count -= max;
-                bitCount   += nbBits;
-            }
-
-            count--;   /* extra accuracy */
-            remaining -= FSE_abs(count);
-            normalizedCounter[charnum++] = count;
-            previous0 = !count;
-            while (remaining < threshold)
-            {
-                nbBits--;
-                threshold >>= 1;
-            }
-
-            {
-                const BYTE* itarget = ip + (bitCount>>3);
-                if (itarget > iend - 4)
-                {
-                    ip = iend - 4;
-                    bitCount -= (int)(8 * (iend - 4 - ip));
-                }
-                else
-                {
-                    ip = itarget;
-                    bitCount &= 7;
-                }
-                bitStream = FSE_readLE32(ip) >> (bitCount & 31);
-            }
-        }
-    }
-    if (remaining != 1) return (size_t)-FSE_ERROR_GENERIC;
-    *maxSVPtr = charnum-1;
-
-    ip += (bitCount+7)>>3;
-    if ((size_t)(ip-istart) > hbSize) return (size_t)-FSE_ERROR_srcSize_wrong;
-    return ip-istart;
-}
-
-
-/****************************************************************
-*  FSE Compression Code
+*  Templates
 ****************************************************************/
 /*
-FSE_CTable[0] is a variable size structure which contains :
-    U16 tableLog;
-    U16 maxSymbolValue;
-    U16 nextStateNumber[1 << tableLog];                         // This size is variable
-    FSE_symbolCompressionTransform symbolTT[maxSymbolValue+1];  // This size is variable
-Allocation is manual, since C standard does not support variable-size structures.
-*/
-
-size_t FSE_sizeof_CTable (unsigned maxSymbolValue, unsigned tableLog)
-{
-    size_t size;
-    FSE_STATIC_ASSERT((size_t)FSE_CTABLE_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)*4 >= sizeof(CTable_max_t));   /* A compilation error here means FSE_CTABLE_SIZE_U32 is not large enough */
-    if (tableLog > FSE_MAX_TABLELOG) return (size_t)-FSE_ERROR_GENERIC;
-    size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
-    return size;
-}
-
-FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog)
-{
-    size_t size;
-    if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
-    size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
-    return (FSE_CTable*)malloc(size);
-}
-
-void  FSE_freeCTable (FSE_CTable* ct)
-{
-    free(ct);
-}
-
-
-unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
-{
-    U32 tableLog = maxTableLog;
-    if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
-    if ((FSE_highbit32((U32)(srcSize - 1)) - 2) < tableLog) tableLog = FSE_highbit32((U32)(srcSize - 1)) - 2;   /* Accuracy can be reduced */
-    if ((FSE_highbit32(maxSymbolValue)+2) > tableLog) tableLog = FSE_highbit32(maxSymbolValue)+2;   /* Need a minimum to safely represent all symbol values */
-    if (tableLog < FSE_MIN_TABLELOG) tableLog = FSE_MIN_TABLELOG;
-    if (tableLog > FSE_MAX_TABLELOG) tableLog = FSE_MAX_TABLELOG;
-    return tableLog;
-}
-
-
-typedef struct
-{
-    U32 id;
-    U32 count;
-} rank_t;
-
-int FSE_compareRankT(const void* r1, const void* r2)
-{
-    const rank_t* R1 = (const rank_t*)r1;
-    const rank_t* R2 = (const rank_t*)r2;
-
-    return 2 * (R1->count < R2->count) - 1;
-}
-
-
-#if 0
-static size_t FSE_adjustNormSlow(short* norm, int pointsToRemove, const unsigned* count, U32 maxSymbolValue)
-{
-    rank_t rank[FSE_MAX_SYMBOL_VALUE+2];
-    U32 s;
-
-    /* Init */
-    for (s=0; s<=maxSymbolValue; s++)
-    {
-        rank[s].id = s;
-        rank[s].count = count[s];
-        if (norm[s] <= 1) rank[s].count = 0;
-    }
-    rank[maxSymbolValue+1].id = 0;
-    rank[maxSymbolValue+1].count = 0;   /* ensures comparison ends here in worst case */
-
-    /* Sort according to count */
-    qsort(rank, maxSymbolValue+1, sizeof(rank_t), FSE_compareRankT);
-
-    while(pointsToRemove)
-    {
-        int newRank = 1;
-        rank_t savedR;
-        if (norm[rank[0].id] == 1)
-            return (size_t)-FSE_ERROR_GENERIC;
-        norm[rank[0].id]--;
-        pointsToRemove--;
-        rank[0].count -= (rank[0].count + 6) >> 3;
-        if (norm[rank[0].id] == 1)
-            rank[0].count=0;
-        savedR = rank[0];
-        while (rank[newRank].count > savedR.count)
-        {
-            rank[newRank-1] = rank[newRank];
-            newRank++;
-        }
-        rank[newRank-1] = savedR;
-    }
-
-    return 0;
-}
-
-#else
-
-/* Secondary normalization method.
-   To be used when primary method fails. */
-
-static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue)
-{
-    U32 s;
-    U32 distributed = 0;
-    U32 ToDistribute;
-
-    /* Init */
-    U32 lowThreshold = (U32)(total >> tableLog);
-    U32 lowOne = (U32)((total * 3) >> (tableLog + 1));
-
-    for (s=0; s<=maxSymbolValue; s++)
-    {
-        if (count[s] == 0)
-        {
-            norm[s]=0;
-            continue;
-        }
-        if (count[s] <= lowThreshold)
-        {
-            norm[s] = -1;
-            distributed++;
-            total -= count[s];
-            continue;
-        }
-        if (count[s] <= lowOne)
-        {
-            norm[s] = 1;
-            distributed++;
-            total -= count[s];
-            continue;
-        }
-        norm[s]=-2;
-    }
-    ToDistribute = (1 << tableLog) - distributed;
-
-    if ((total / ToDistribute) > lowOne)
-    {
-        /* risk of rounding to zero */
-        lowOne = (U32)((total * 3) / (ToDistribute * 2));
-        for (s=0; s<=maxSymbolValue; s++)
-        {
-            if ((norm[s] == -2) && (count[s] <= lowOne))
-            {
-                norm[s] = 1;
-                distributed++;
-                total -= count[s];
-                continue;
-            }
-        }
-        ToDistribute = (1 << tableLog) - distributed;
-    }
-
-    if (distributed == maxSymbolValue+1)
-    {
-        /* all values are pretty poor;
-           probably incompressible data (should have already been detected);
-           find max, then give all remaining points to max */
-        U32 maxV = 0, maxC =0;
-        for (s=0; s<=maxSymbolValue; s++)
-            if (count[s] > maxC) maxV=s, maxC=count[s];
-        norm[maxV] += (short)ToDistribute;
-        return 0;
-    }
-
-    {
-        U64 const vStepLog = 62 - tableLog;
-        U64 const mid = (1ULL << (vStepLog-1)) - 1;
-        U64 const rStep = ((((U64)1<<vStepLog) * ToDistribute) + mid) / total;   /* scale on remaining */
-        U64 tmpTotal = mid;
-        for (s=0; s<=maxSymbolValue; s++)
-        {
-            if (norm[s]==-2)
-            {
-                U64 end = tmpTotal + (count[s] * rStep);
-                U32 sStart = (U32)(tmpTotal >> vStepLog);
-                U32 sEnd = (U32)(end >> vStepLog);
-                U32 weight = sEnd - sStart;
-                if (weight < 1)
-                    return (size_t)-FSE_ERROR_GENERIC;
-                norm[s] = (short)weight;
-                tmpTotal = end;
-            }
-        }
-    }
-
-    return 0;
-}
-#endif
-
-
-size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
-                           const unsigned* count, size_t total,
-                           unsigned maxSymbolValue)
-{
-    /* Sanity checks */
-    if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
-    if (tableLog < FSE_MIN_TABLELOG) return (size_t)-FSE_ERROR_GENERIC;   /* Unsupported size */
-    if (tableLog > FSE_MAX_TABLELOG) return (size_t)-FSE_ERROR_GENERIC;   /* Unsupported size */
-    if ((1U<<tableLog) <= maxSymbolValue) return (size_t)-FSE_ERROR_GENERIC;   /* Too small tableLog, compression potentially impossible */
-
-    {
-        U32 const rtbTable[] = {     0, 473195, 504333, 520860, 550000, 700000, 750000, 830000 };
-        U64 const scale = 62 - tableLog;
-        U64 const step = ((U64)1<<62) / total;   /* <== here, one division ! */
-        U64 const vStep = 1ULL<<(scale-20);
-        int stillToDistribute = 1<<tableLog;
-        unsigned s;
-        unsigned largest=0;
-        short largestP=0;
-        U32 lowThreshold = (U32)(total >> tableLog);
-
-        for (s=0; s<=maxSymbolValue; s++)
-        {
-            if (count[s] == total) return 0;
-            if (count[s] == 0)
-            {
-                normalizedCounter[s]=0;
-                continue;
-            }
-            if (count[s] <= lowThreshold)
-            {
-                normalizedCounter[s] = -1;
-                stillToDistribute--;
-            }
-            else
-            {
-                short proba = (short)((count[s]*step) >> scale);
-                if (proba<8)
-                {
-                    U64 restToBeat = vStep * rtbTable[proba];
-                    proba += (count[s]*step) - ((U64)proba<<scale) > restToBeat;
-                }
-                if (proba > largestP)
-                {
-                    largestP=proba;
-                    largest=s;
-                }
-                normalizedCounter[s] = proba;
-                stillToDistribute -= proba;
-            }
-        }
-        if (-stillToDistribute >= (normalizedCounter[largest] >> 1))
-        {
-            /* corner case, need another normalization method */
-            size_t errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue);
-            if (FSE_isError(errorCode)) return errorCode;
-        }
-        else normalizedCounter[largest] += (short)stillToDistribute;
-    }
-
-#if 0
-    {   /* Print Table (debug) */
-        U32 s;
-        U32 nTotal = 0;
-        for (s=0; s<=maxSymbolValue; s++)
-            printf("%3i: %4i \n", s, normalizedCounter[s]);
-        for (s=0; s<=maxSymbolValue; s++)
-            nTotal += abs(normalizedCounter[s]);
-        if (nTotal != (1U<<tableLog))
-            printf("Warning !!! Total == %u != %u !!!", nTotal, 1U<<tableLog);
-        getchar();
-    }
-#endif
-
-    return tableLog;
-}
-
-
-/* fake FSE_CTable, for raw (uncompressed) input */
-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits)
-{
-    const unsigned tableSize = 1 << nbBits;
-    const unsigned tableMask = tableSize - 1;
-    const unsigned maxSymbolValue = tableMask;
-    U16* tableU16 = ( (U16*) ct) + 2;
-    FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) ((((U32*)ct)+1) + (tableSize>>1));
-    unsigned s;
-
-    /* Sanity checks */
-    if (nbBits < 1) return (size_t)-FSE_ERROR_GENERIC;             /* min size */
-
-    /* header */
-    tableU16[-2] = (U16) nbBits;
-    tableU16[-1] = (U16) maxSymbolValue;
-
-    /* Build table */
-    for (s=0; s<tableSize; s++)
-        tableU16[s] = (U16)(tableSize + s);
-
-    /* Build Symbol Transformation Table */
-    for (s=0; s<=maxSymbolValue; s++)
-    {
-        symbolTT[s].minBitsOut = (BYTE)nbBits;
-        symbolTT[s].deltaFindState = s-1;
-        symbolTT[s].maxState = (U16)( (tableSize*2) - 1);   /* ensures state <= maxState */
-    }
-
-    return 0;
-}
-
-
-/* fake FSE_CTable, for rle (100% always same symbol) input */
-size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue)
-{
-    const unsigned tableSize = 1;
-    U16* tableU16 = ( (U16*) ct) + 2;
-    FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) ((U32*)ct + 2);
-
-    /* header */
-    tableU16[-2] = (U16) 0;
-    tableU16[-1] = (U16) symbolValue;
-
-    /* Build table */
-    tableU16[0] = 0;
-    tableU16[1] = 0;   /* just in case */
-
-    /* Build Symbol Transformation Table */
-    {
-        symbolTT[symbolValue].minBitsOut = 0;
-        symbolTT[symbolValue].deltaFindState = 0;
-        symbolTT[symbolValue].maxState = (U16)(2*tableSize-1);   /* ensures state <= maxState */
-    }
-
-    return 0;
-}
-
-
-void FSE_initCStream(FSE_CStream_t* bitC, void* start)
-{
-    bitC->bitContainer = 0;
-    bitC->bitPos = 0;   /* reserved for unusedBits */
-    bitC->startPtr = (char*)start;
-    bitC->ptr = bitC->startPtr;
-}
-
-void FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct)
-{
-    const U32 tableLog = ( (const U16*) ct) [0];
-    statePtr->value = (ptrdiff_t)1<<tableLog;
-    statePtr->stateTable = ((const U16*) ct) + 2;
-    statePtr->symbolTT = (const FSE_symbolCompressionTransform*)((const U32*)ct + 1 + (tableLog ? (1<<(tableLog-1)) : 1));
-    statePtr->stateLog = tableLog;
-}
-
-void FSE_addBits(FSE_CStream_t* bitC, size_t value, unsigned nbBits)
-{
-    static const unsigned mask[] = { 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF, 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF,  0xFFFFFF, 0x1FFFFFF };   /* up to 25 bits */
-    bitC->bitContainer |= (value & mask[nbBits]) << bitC->bitPos;
-    bitC->bitPos += nbBits;
-}
-
-void FSE_encodeSymbol(FSE_CStream_t* bitC, FSE_CState_t* statePtr, BYTE symbol)
-{
-    const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
-    const U16* const stateTable = (const U16*)(statePtr->stateTable);
-    int nbBitsOut  = symbolTT.minBitsOut;
-    nbBitsOut -= (int)((symbolTT.maxState - statePtr->value) >> 31);
-    FSE_addBits(bitC, statePtr->value, nbBitsOut);
-    statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
-}
-
-void FSE_flushBits(FSE_CStream_t* bitC)
-{
-    size_t nbBytes = bitC->bitPos >> 3;
-    FSE_writeLEST(bitC->ptr, bitC->bitContainer);
-    bitC->bitPos &= 7;
-    bitC->ptr += nbBytes;
-    bitC->bitContainer >>= nbBytes*8;
-}
-
-void FSE_flushCState(FSE_CStream_t* bitC, const FSE_CState_t* statePtr)
-{
-    FSE_addBits(bitC, statePtr->value, statePtr->stateLog);
-    FSE_flushBits(bitC);
-}
-
-
-size_t FSE_closeCStream(FSE_CStream_t* bitC)
-{
-    char* endPtr;
-
-    FSE_addBits(bitC, 1, 1);
-    FSE_flushBits(bitC);
-
-    endPtr = bitC->ptr;
-    endPtr += bitC->bitPos > 0;
-
-    return (endPtr - bitC->startPtr);
-}
-
-
-size_t FSE_compress_usingCTable (void* dst, size_t dstSize,
-                           const void* src, size_t srcSize,
-                           const FSE_CTable* ct)
-{
-    const BYTE* const istart = (const BYTE*) src;
-    const BYTE* ip;
-    const BYTE* const iend = istart + srcSize;
-
-    FSE_CStream_t bitC;
-    FSE_CState_t CState1, CState2;
-
-
-    /* init */
-    (void)dstSize;   /* objective : ensure it fits into dstBuffer (Todo) */
-    FSE_initCStream(&bitC, dst);
-    FSE_initCState(&CState1, ct);
-    CState2 = CState1;
-
-    ip=iend;
-
-    /* join to even */
-    if (srcSize & 1)
-    {
-        FSE_encodeSymbol(&bitC, &CState1, *--ip);
-        FSE_flushBits(&bitC);
-    }
-
-    /* join to mod 4 */
-    if ((sizeof(size_t)*8 > FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2))   /* test bit 2 */
-    {
-        FSE_encodeSymbol(&bitC, &CState2, *--ip);
-        FSE_encodeSymbol(&bitC, &CState1, *--ip);
-        FSE_flushBits(&bitC);
-    }
-
-    /* 2 or 4 encoding per loop */
-    while (ip>istart)
-    {
-        FSE_encodeSymbol(&bitC, &CState2, *--ip);
-
-        if (sizeof(size_t)*8 < FSE_MAX_TABLELOG*2+7 )   /* this test must be static */
-            FSE_flushBits(&bitC);
-
-        FSE_encodeSymbol(&bitC, &CState1, *--ip);
-
-        if (sizeof(size_t)*8 > FSE_MAX_TABLELOG*4+7 )   /* this test must be static */
-        {
-            FSE_encodeSymbol(&bitC, &CState2, *--ip);
-            FSE_encodeSymbol(&bitC, &CState1, *--ip);
-        }
-
-        FSE_flushBits(&bitC);
-    }
-
-    FSE_flushCState(&bitC, &CState2);
-    FSE_flushCState(&bitC, &CState1);
-    return FSE_closeCStream(&bitC);
-}
-
-
-size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }
-
-
-size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog)
-{
-    const BYTE* const istart = (const BYTE*) src;
-    const BYTE* ip = istart;
-
-    BYTE* const ostart = (BYTE*) dst;
-    BYTE* op = ostart;
-    BYTE* const oend = ostart + dstSize;
-
-    U32   count[FSE_MAX_SYMBOL_VALUE+1];
-    S16   norm[FSE_MAX_SYMBOL_VALUE+1];
-    CTable_max_t ct;
-    size_t errorCode;
-
-    /* early out */
-    if (dstSize < FSE_compressBound(srcSize)) return (size_t)-FSE_ERROR_dstSize_tooSmall;
-    if (srcSize <= 1) return srcSize;  /* Uncompressed or RLE */
-    if (!maxSymbolValue) maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
-    if (!tableLog) tableLog = FSE_DEFAULT_TABLELOG;
-
-    /* Scan input and build symbol stats */
-    errorCode = FSE_count (count, &maxSymbolValue, ip, srcSize);
-    if (FSE_isError(errorCode)) return errorCode;
-    if (errorCode == srcSize) return 1;
-    if (errorCode < (srcSize >> 7)) return 0;   /* Heuristic : not compressible enough */
-
-    tableLog = FSE_optimalTableLog(tableLog, srcSize, maxSymbolValue);
-    errorCode = FSE_normalizeCount (norm, tableLog, count, srcSize, maxSymbolValue);
-    if (FSE_isError(errorCode)) return errorCode;
-
-    /* Write table description header */
-    errorCode = FSE_writeNCount (op, FSE_MAX_HEADERSIZE, norm, maxSymbolValue, tableLog);
-    if (FSE_isError(errorCode)) return errorCode;
-    op += errorCode;
-
-    /* Compress */
-    errorCode = FSE_buildCTable (ct, norm, maxSymbolValue, tableLog);
-    if (FSE_isError(errorCode)) return errorCode;
-    op += FSE_compress_usingCTable(op, oend - op, ip, srcSize, ct);
-
-    /* check compressibility */
-    if ( (size_t)(op-ostart) >= srcSize-1 )
-        return 0;
-
-    return op-ostart;
-}
-
-
-size_t FSE_compress (void* dst, size_t dstSize, const void* src, size_t srcSize)
-{
-    return FSE_compress2(dst, dstSize, src, (U32)srcSize, FSE_MAX_SYMBOL_VALUE, FSE_DEFAULT_TABLELOG);
-}
-
-
-/*********************************************************
-*  Decompression (Byte symbols)
-*********************************************************/
-typedef struct
-{
-    U16  newState;
-    BYTE symbol;
-    BYTE nbBits;
-} FSE_decode_t;   /* size == U32 */
-
-
-size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
-{
-    U32* const base32 = (U32*)dt;
-    FSE_decode_t* const cell = (FSE_decode_t*)(base32 + 1);
-
-    base32[0] = 0;
-
-    cell->newState = 0;
-    cell->symbol = symbolValue;
-    cell->nbBits = 0;
-
-    return 0;
-}
-
-
-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
-{
-    U32* const base32 = (U32*)dt;
-    FSE_decode_t* dinfo = (FSE_decode_t*)(base32 + 1);
-    const unsigned tableSize = 1 << nbBits;
-    const unsigned tableMask = tableSize - 1;
-    const unsigned maxSymbolValue = tableMask;
-    unsigned s;
-
-    /* Sanity checks */
-    if (nbBits < 1) return (size_t)-FSE_ERROR_GENERIC;             /* min size */
-
-    /* Build Decoding Table */
-    base32[0] = nbBits;
-    for (s=0; s<=maxSymbolValue; s++)
-    {
-        dinfo[s].newState = 0;
-        dinfo[s].symbol = (BYTE)s;
-        dinfo[s].nbBits = (BYTE)nbBits;
-    }
-
-    return 0;
-}
-
-
-/* FSE_initDStream
- * Initialize a FSE_DStream_t.
- * srcBuffer must point at the beginning of an FSE block.
- * The function result is the size of the FSE_block (== srcSize).
- * If srcSize is too small, the function will return an errorCode;
- */
-size_t FSE_initDStream(FSE_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
-{
-    if (srcSize < 1) return (size_t)-FSE_ERROR_srcSize_wrong;
-
-    if (srcSize >=  sizeof(size_t))
-    {
-        U32 contain32;
-        bitD->start = (const char*)srcBuffer;
-        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(size_t);
-        bitD->bitContainer = FSE_readLEST(bitD->ptr);
-        contain32 = ((const BYTE*)srcBuffer)[srcSize-1];
-        if (contain32 == 0) return (size_t)-FSE_ERROR_GENERIC;   /* stop bit not present */
-        bitD->bitsConsumed = 8 - FSE_highbit32(contain32);
-    }
-    else
-    {
-        U32 contain32;
-        bitD->start = (const char*)srcBuffer;
-        bitD->ptr   = bitD->start;
-        bitD->bitContainer = *(const BYTE*)(bitD->start);
-        switch(srcSize)
-        {
-            case 7: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[6]) << (sizeof(size_t)*8 - 16);
-            case 6: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[5]) << (sizeof(size_t)*8 - 24);
-            case 5: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[4]) << (sizeof(size_t)*8 - 32);
-            case 4: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[3]) << 24;
-            case 3: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[2]) << 16;
-            case 2: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[1]) <<  8;
-            default:;
-        }
-        contain32 = ((const BYTE*)srcBuffer)[srcSize-1];
-        if (contain32 == 0) return (size_t)-FSE_ERROR_GENERIC;   /* stop bit not present */
-        bitD->bitsConsumed = 8 - FSE_highbit32(contain32);
-        bitD->bitsConsumed += (U32)(sizeof(size_t) - srcSize)*8;
-    }
-
-    return srcSize;
-}
-
-
-/* FSE_readBits
- * Read next n bits from the bitContainer.
- * On 32-bits, don't read more than maxNbBits==25
- * On 64-bits, don't read more than maxNbBits==57
- * Use the fast variant *only* if n >= 1.
- * return : value extracted.
- */
-size_t FSE_readBits(FSE_DStream_t* bitD, U32 nbBits)
-{
-    size_t value = ((bitD->bitContainer << (bitD->bitsConsumed & ((sizeof(size_t)*8)-1))) >> 1) >> (((sizeof(size_t)*8)-1)-nbBits);
-    bitD->bitsConsumed += nbBits;
-    return value;
-}
-
-size_t FSE_readBitsFast(FSE_DStream_t* bitD, U32 nbBits)   /* only if nbBits >= 1 !! */
-{
-    size_t value = (bitD->bitContainer << bitD->bitsConsumed) >> ((sizeof(size_t)*8)-nbBits);
-    bitD->bitsConsumed += nbBits;
-    return value;
-}
-
-unsigned FSE_reloadDStream(FSE_DStream_t* bitD)
-{
-    if (bitD->ptr >= bitD->start + sizeof(size_t))
-    {
-        bitD->ptr -= bitD->bitsConsumed >> 3;
-        bitD->bitsConsumed &= 7;
-        bitD->bitContainer = FSE_readLEST(bitD->ptr);
-        return 0;
-    }
-    if (bitD->ptr == bitD->start)
-    {
-        if (bitD->bitsConsumed < sizeof(size_t)*8) return 1;
-        if (bitD->bitsConsumed == sizeof(size_t)*8) return 2;
-        return 3;
-    }
-    {
-        U32 nbBytes = bitD->bitsConsumed >> 3;
-        if (bitD->ptr - nbBytes < bitD->start)
-            nbBytes = (U32)(bitD->ptr - bitD->start);  /* note : necessarily ptr > start */
-        bitD->ptr -= nbBytes;
-        bitD->bitsConsumed -= nbBytes*8;
-        bitD->bitContainer = FSE_readLEST(bitD->ptr);   /* note : necessarily srcSize > sizeof(bitD) */
-        return (bitD->ptr == bitD->start);
-    }
-}
-
-
-void FSE_initDState(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD, const FSE_DTable* dt)
-{
-    const U32* const base32 = (const U32*)dt;
-    DStatePtr->state = FSE_readBits(bitD, base32[0]);
-    FSE_reloadDStream(bitD);
-    DStatePtr->table = base32 + 1;
-}
-
-BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD)
-{
-    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
-    const U32  nbBits = DInfo.nbBits;
-    BYTE symbol = DInfo.symbol;
-    size_t lowBits = FSE_readBits(bitD, nbBits);
-
-    DStatePtr->state = DInfo.newState + lowBits;
-    return symbol;
-}
-
-BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD)
-{
-    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
-    const U32 nbBits = DInfo.nbBits;
-    BYTE symbol = DInfo.symbol;
-    size_t lowBits = FSE_readBitsFast(bitD, nbBits);
-
-    DStatePtr->state = DInfo.newState + lowBits;
-    return symbol;
-}
-
-/* FSE_endOfDStream
-   Tells if bitD has reached end of bitStream or not */
-
-unsigned FSE_endOfDStream(const FSE_DStream_t* bitD)
-{
-    return ((bitD->ptr == bitD->start) && (bitD->bitsConsumed == sizeof(size_t)*8));
-}
-
-unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
-{
-    return DStatePtr->state == 0;
-}
-
-
-FORCE_INLINE size_t FSE_decompress_usingDTable_generic(
-          void* dst, size_t maxDstSize,
-    const void* cSrc, size_t cSrcSize,
-    const FSE_DTable* dt, unsigned fast)
-{
-    BYTE* const ostart = (BYTE*) dst;
-    BYTE* op = ostart;
-    BYTE* const omax = op + maxDstSize;
-    BYTE* const olimit = omax-3;
-
-    FSE_DStream_t bitD;
-    FSE_DState_t state1;
-    FSE_DState_t state2;
-    size_t errorCode;
-
-    /* Init */
-    errorCode = FSE_initDStream(&bitD, cSrc, cSrcSize);   /* replaced last arg by maxCompressed Size */
-    if (FSE_isError(errorCode)) return errorCode;
-
-    FSE_initDState(&state1, &bitD, dt);
-    FSE_initDState(&state2, &bitD, dt);
-
-
-    /* 2 symbols per loop */
-    while (!FSE_reloadDStream(&bitD) && (op<olimit))
-    {
-        *op++ = fast ? FSE_decodeSymbolFast(&state1, &bitD) : FSE_decodeSymbol(&state1, &bitD);
-
-        if (FSE_MAX_TABLELOG*2+7 > sizeof(size_t)*8)    /* This test must be static */
-            FSE_reloadDStream(&bitD);
-
-        *op++ = fast ? FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD);
-
-        if (FSE_MAX_TABLELOG*4+7 < sizeof(size_t)*8)    /* This test must be static */
-        {
-            *op++ = fast ? FSE_decodeSymbolFast(&state1, &bitD) : FSE_decodeSymbol(&state1, &bitD);
-            *op++ = fast ? FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD);
-        }
-    }
-
-    /* tail */
-    /* note : FSE_reloadDStream(&bitD) >= 1; Ends at exactly 2 */
-    while (1)
-    {
-        if ( (FSE_reloadDStream(&bitD)>2) || (op==omax) || (FSE_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state1))) )
-            break;
-
-        *op++ = fast ? FSE_decodeSymbolFast(&state1, &bitD) : FSE_decodeSymbol(&state1, &bitD);
-
-        if ( (FSE_reloadDStream(&bitD)>2) || (op==omax) || (FSE_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state2))) )
-            break;
-
-        *op++ = fast ? FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD);
-    }
-
-    /* end ? */
-    if (FSE_endOfDStream(&bitD) && FSE_endOfDState(&state1) && FSE_endOfDState(&state2) )
-        return op-ostart;
-
-    if (op==omax) return (size_t)-FSE_ERROR_dstSize_tooSmall;   /* dst buffer is full, but cSrc unfinished */
-
-    return (size_t)-FSE_ERROR_corruptionDetected;
-}
-
-
-size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
-                            const void* cSrc, size_t cSrcSize,
-                            const FSE_DTable* dt, size_t fastMode)
-{
-    /* select fast mode (static) */
-    if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
-    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
-}
-
-
-size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
-{
-    const BYTE* const istart = (const BYTE*)cSrc;
-    const BYTE* ip = istart;
-    short counting[FSE_MAX_SYMBOL_VALUE+1];
-    DTable_max_t dt;   /* Static analyzer seems unable to understand this table will be properly initialized later */
-    unsigned tableLog;
-    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
-    size_t errorCode, fastMode;
-
-    if (cSrcSize<2) return (size_t)-FSE_ERROR_srcSize_wrong;   /* too small input size */
-
-    /* normal FSE decoding mode */
-    errorCode = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
-    if (FSE_isError(errorCode)) return errorCode;
-    if (errorCode >= cSrcSize) return (size_t)-FSE_ERROR_srcSize_wrong;   /* too small input size */
-    ip += errorCode;
-    cSrcSize -= errorCode;
-
-    fastMode = FSE_buildDTable (dt, counting, maxSymbolValue, tableLog);
-    if (FSE_isError(fastMode)) return fastMode;
-
-    /* always return, even if it is an error code */
-    return FSE_decompress_usingDTable (dst, maxDstSize, ip, cSrcSize, dt, fastMode);
-}
-
-
-#endif   /* FSE_COMMONDEFS_ONLY */
-
-/*
-  2nd part of the file
   designed to be included
   for type-specific functions (template emulation in C)
   Objective is to write these functions only once, for improved maintenance
@@ -1456,7 +512,7 @@ size_t FSE_FUNCTION_NAME(FSE_buildCTable, FSE_FUNCTION_EXTENSION)
     const unsigned step = FSE_tableStep(tableSize);
     unsigned cumul[FSE_MAX_SYMBOL_VALUE+2];
     U32 position = 0;
-    FSE_FUNCTION_TYPE tableSymbol[FSE_MAX_TABLESIZE] = {0};   /* should not be necessary, but analyzer complain without it, and performance loss is negligible with it */
+    FSE_FUNCTION_TYPE tableSymbol[FSE_MAX_TABLESIZE]; /* init not necessary, but analyzer complain about it */
     U32 highThreshold = tableSize-1;
     unsigned symbol;
     unsigned i;
@@ -1499,8 +555,8 @@ size_t FSE_FUNCTION_NAME(FSE_buildCTable, FSE_FUNCTION_EXTENSION)
     /* Build table */
     for (i=0; i<tableSize; i++)
     {
-        FSE_FUNCTION_TYPE s = tableSymbol[i];
-        tableU16[cumul[s]++] = (U16) (tableSize+i);   /* Table U16 : sorted by symbol order; gives next state value */
+        FSE_FUNCTION_TYPE s = tableSymbol[i];   /* static analyzer doesn't understand tableSymbol is properly initialized */
+        tableU16[cumul[s]++] = (U16) (tableSize+i);   /* TableU16 : sorted by symbol order; gives next state value */
     }
 
     /* Build Symbol Transformation Table */
@@ -1511,20 +567,22 @@ size_t FSE_FUNCTION_NAME(FSE_buildCTable, FSE_FUNCTION_EXTENSION)
         {
             switch (normalizedCounter[s])
             {
-            case 0:
+            case  0:
                 break;
             case -1:
-            case 1:
-                symbolTT[s].minBitsOut = (BYTE)tableLog;
+            case  1:
+                symbolTT[s].deltaNbBits = tableLog << 16;
                 symbolTT[s].deltaFindState = total - 1;
                 total ++;
-                symbolTT[s].maxState = (U16)( (tableSize*2) - 1);   /* ensures state <= maxState */
                 break;
             default :
-                symbolTT[s].minBitsOut = (BYTE)( (tableLog-1) - FSE_highbit32 (normalizedCounter[s]-1) );
-                symbolTT[s].deltaFindState = total - normalizedCounter[s];
-                total +=  normalizedCounter[s];
-                symbolTT[s].maxState = (U16)( (normalizedCounter[s] << (symbolTT[s].minBitsOut+1)) - 1);
+                {
+                    U32 maxBitsOut = tableLog - FSE_highbit32 (normalizedCounter[s]-1);
+                    U32 minStatePlus = normalizedCounter[s] << maxBitsOut;
+                    symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
+                    symbolTT[s].deltaFindState = total - normalizedCounter[s];
+                    total +=  normalizedCounter[s];
+                }
             }
         }
     }
@@ -1546,12 +604,16 @@ void FSE_FUNCTION_NAME(FSE_freeDTable, FSE_FUNCTION_EXTENSION) (FSE_DTable* dt)
     free(dt);
 }
 
+typedef struct {
+    U16 tableLog;   /* log2 of the decoding table size; written by FSE_buildDTable */
+    U16 fastMode;   /* flag stored by FSE_buildDTable (the value it previously returned) */
+} FSE_DTableHeader;   /* sizeof U32 */
 
 size_t FSE_FUNCTION_NAME(FSE_buildDTable, FSE_FUNCTION_EXTENSION)
 (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
 {
-    U32* const base32 = (U32*)dt;
-    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (base32+1);
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)dt;
+    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (dt+1);   /* because dt is unsigned, 32-bits aligned on 32-bits */
     const U32 tableSize = 1 << tableLog;
     const U32 tableMask = tableSize-1;
     const U32 step = FSE_tableStep(tableSize);
@@ -1567,7 +629,7 @@ size_t FSE_FUNCTION_NAME(FSE_buildDTable, FSE_FUNCTION_EXTENSION)
     if (tableLog > FSE_MAX_TABLELOG) return (size_t)-FSE_ERROR_tableLog_tooLarge;
 
     /* Init, lay down lowprob symbols */
-    base32[0] = tableLog;
+    DTableH[0].tableLog = (U16)tableLog;
     for (s=0; s<=maxSymbolValue; s++)
     {
         if (normalizedCounter[s]==-1)
@@ -1608,5 +670,1788 @@ size_t FSE_FUNCTION_NAME(FSE_buildDTable, FSE_FUNCTION_EXTENSION)
         }
     }
 
-    return noLarge;
+    DTableH->fastMode = (U16)noLarge;
+    return 0;
 }
+
+
+/******************************************
+*  FSE byte symbol
+******************************************/
+#ifndef FSE_COMMONDEFS_ONLY
+
+unsigned FSE_isError(size_t code) { return (code > (size_t)(-FSE_ERROR_maxCode)); }   /* errors are returned as (size_t)-errorEnum, i.e. the highest size_t values */
+
+#define FSE_GENERATE_STRING(STRING) #STRING,
+static const char* FSE_errorStrings[] = { FSE_LIST_ERRORS(FSE_GENERATE_STRING) };   /* one stringified name per FSE_LIST_ERRORS entry */
+
+const char* FSE_getErrorName(size_t code)
+{
+    /* an error is (size_t)-index : negating it recovers the position in FSE_errorStrings */
+    if (!FSE_isError(code)) return "Unspecified error code";   /* fallback for any value which is not an error */
+    return FSE_errorStrings[-(int)(code)];
+}
+
+static short FSE_abs(short a)
+{
+    return (short)((a < 0) ? -a : a);   /* the conditional is evaluated as int (integer promotion), hence the explicit cast back */
+}
+
+
+/****************************************************************
+*  Header bitstream management
+****************************************************************/
+size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
+{
+    if (maxSymbolValue == 0) return FSE_NCOUNTBOUND;   /* maxSymbolValue==0 ? use default */
+    return (((maxSymbolValue+1) * tableLog) >> 3) + 3;   /* tableLog bits for each of the (maxSymbolValue+1) symbols, in bytes, plus 3 */
+}
+
+static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
+                                       const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+                                       unsigned writeIsSafe)   /* writeIsSafe != 0 : the per-write buffer overflow checks are skipped */
+{   /* Packs tableLog then the normalized counters into header, 16 bits at a time, low byte first. Returns the number of bytes written, or an error code */
+    BYTE* const ostart = (BYTE*) header;
+    BYTE* out = ostart;
+    BYTE* const oend = ostart + headerBufferSize;
+    int nbBits;
+    const int tableSize = 1 << tableLog;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    int previous0 = 0;
+
+    bitStream = 0;
+    bitCount  = 0;
+    /* Table Size : stored on 4 bits, as (tableLog - FSE_MIN_TABLELOG) */
+    bitStream += (tableLog-FSE_MIN_TABLELOG) << bitCount;
+    bitCount  += 4;
+
+    /* Init */
+    remaining = tableSize+1;   /* +1 for extra accuracy */
+    threshold = tableSize;
+    nbBits = tableLog+1;
+
+    while (remaining>1)   /* stops at 1 */
+    {
+        if (previous0)   /* previous counter was 0 : encode how many zero counters follow it */
+        {
+            unsigned start = charnum;
+            while (!normalizedCounter[charnum]) charnum++;
+            while (charnum >= start+24)   /* each 16-bit 0xFFFF marker stands for 24 more zeros */
+            {
+                start+=24;
+                bitStream += 0xFFFFU << bitCount;
+                if ((!writeIsSafe) && (out > oend-2)) return (size_t)-FSE_ERROR_dstSize_tooSmall;   /* Buffer overflow */
+                out[0] = (BYTE) bitStream;
+                out[1] = (BYTE)(bitStream>>8);
+                out+=2;
+                bitStream>>=16;
+            }
+            while (charnum >= start+3)   /* each 2-bit value 3 stands for 3 more zeros */
+            {
+                start+=3;
+                bitStream += 3 << bitCount;
+                bitCount += 2;
+            }
+            bitStream += (charnum-start) << bitCount;   /* rest of the run (0..2), on 2 bits */
+            bitCount += 2;
+            if (bitCount>16)
+            {
+                if ((!writeIsSafe) && (out > oend - 2)) return (size_t)-FSE_ERROR_dstSize_tooSmall;   /* Buffer overflow */
+                out[0] = (BYTE)bitStream;
+                out[1] = (BYTE)(bitStream>>8);
+                out += 2;
+                bitStream >>= 16;
+                bitCount -= 16;
+            }
+        }
+        {
+            short count = normalizedCounter[charnum++];
+            const short max = (short)((2*threshold-1)-remaining);   /* values below max are written with one bit less */
+            remaining -= FSE_abs(count);
+            if (remaining<1) return (size_t)-FSE_ERROR_GENERIC;   /* counters add up to more than tableSize */
+            count++;   /* +1 for extra accuracy */
+            if (count>=threshold) count += max;   /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
+            bitStream += count << bitCount;
+            bitCount  += nbBits;
+            bitCount  -= (count<max);
+            previous0 = (count==1);   /* count was incremented above : 1 means the stored counter is 0 */
+            while (remaining<threshold) nbBits--, threshold>>=1;   /* narrow the field as less remains to distribute */
+        }
+        if (bitCount>16)   /* flush 16 bits, low byte first */
+        {
+            if ((!writeIsSafe) && (out > oend - 2)) return (size_t)-FSE_ERROR_dstSize_tooSmall;   /* Buffer overflow */
+            out[0] = (BYTE)bitStream;
+            out[1] = (BYTE)(bitStream>>8);
+            out += 2;
+            bitStream >>= 16;
+            bitCount -= 16;
+        }
+    }
+
+    /* flush remaining bitStream */
+    if ((!writeIsSafe) && (out > oend - 2)) return (size_t)-FSE_ERROR_dstSize_tooSmall;   /* Buffer overflow */
+    out[0] = (BYTE)bitStream;
+    out[1] = (BYTE)(bitStream>>8);
+    out+= (bitCount+7) /8;   /* two bytes are stored, but only those holding pending bits are counted */
+
+    if (charnum > maxSymbolValue + 1) return (size_t)-FSE_ERROR_GENERIC;   /* more counters were consumed than maxSymbolValue allows */
+
+    return (out-ostart);
+}
+
+
+size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    /* only table sizes within [FSE_MIN_TABLELOG, FSE_MAX_TABLELOG] can be written */
+    if ((tableLog < FSE_MIN_TABLELOG) || (tableLog > FSE_MAX_TABLELOG)) return (size_t)-FSE_ERROR_GENERIC;   /* Unsupported */
+
+    /* buffer at least as large as the worst-case bound : per-write overflow checks can be skipped */
+    if (bufferSize >= FSE_NCountWriteBound(maxSymbolValue, tableLog))
+        return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1);
+    return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);   /* smaller buffer : checked writes */
+}
+
+
+size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                 const void* headerBuffer, size_t hbSize)
+{   /* Decodes tableLog and the normalized counters from headerBuffer. *maxSVPtr is in/out : largest symbol accepted, then last symbol read. Returns the bytes consumed, or an error code */
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    int previous0 = 0;
+
+    if (hbSize < 4) return (size_t)-FSE_ERROR_srcSize_wrong;   /* the first access below reads 32 bits */
+    bitStream = FSE_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return (size_t)-FSE_ERROR_tableLog_tooLarge;
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;
+    threshold = 1<<nbBits;
+    nbBits++;
+
+    while ((remaining>1) && (charnum<=*maxSVPtr))
+    {
+        if (previous0)   /* previous counter was 0 : decode the run of zero counters which follows */
+        {
+            unsigned n0 = charnum;
+            while ((bitStream & 0xFFFF) == 0xFFFF)   /* each 16-bit 0xFFFF marker adds 24 zeros */
+            {
+                n0+=24;
+                if (ip < iend-5)   /* room to advance 2 bytes and still read 32 bits */
+                {
+                    ip+=2;
+                    bitStream = FSE_readLE32(ip) >> bitCount;
+                }
+                else
+                {
+                    bitStream >>= 16;
+                    bitCount+=16;
+                }
+            }
+            while ((bitStream & 3) == 3)   /* each 2-bit value 3 adds 3 zeros */
+            {
+                n0+=3;
+                bitStream>>=2;
+                bitCount+=2;
+            }
+            n0 += bitStream & 3;   /* rest of the run (0..2) */
+            bitCount += 2;
+            if (n0 > *maxSVPtr) return (size_t)-FSE_ERROR_maxSymbolValue_tooSmall;
+            while (charnum < n0) normalizedCounter[charnum++] = 0;
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))   /* reload only if a 32-bit read stays inside the buffer */
+            {
+                ip += bitCount>>3;
+                bitCount &= 7;
+                bitStream = FSE_readLE32(ip) >> bitCount;
+            }
+            else
+                bitStream >>= 2;
+        }
+        {
+            const short max = (short)((2*threshold-1)-remaining);   /* values below max were stored with one bit less */
+            short count;
+
+            if ((bitStream & (threshold-1)) < (U32)max)
+            {
+                count = (short)(bitStream & (threshold-1));
+                bitCount   += nbBits-1;
+            }
+            else
+            {
+                count = (short)(bitStream & (2*threshold-1));
+                if (count >= threshold) count -= max;
+                bitCount   += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            remaining -= FSE_abs(count);
+            normalizedCounter[charnum++] = count;
+            previous0 = !count;
+            while (remaining < threshold)   /* narrow the field as less remains to distribute */
+            {
+                nbBits--;
+                threshold >>= 1;
+            }
+
+            {
+                if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4))
+                {
+                    ip += bitCount>>3;
+                    bitCount &= 7;
+                }
+                else   /* close to the end : read from the last 4 bytes and keep the offset in bitCount */
+                {
+                    bitCount -= (int)(8 * (iend - 4 - ip));
+					ip = iend - 4;
+				}
+                bitStream = FSE_readLE32(ip) >> (bitCount & 31);
+            }
+        }
+    }
+    if (remaining != 1) return (size_t)-FSE_ERROR_GENERIC;   /* counters do not add up exactly to the table size */
+    *maxSVPtr = charnum-1;
+
+    ip += (bitCount+7)>>3;
+    if ((size_t)(ip-istart) > hbSize) return (size_t)-FSE_ERROR_srcSize_wrong;   /* header claims more bytes than provided */
+    return ip-istart;
+}
+
+
+/****************************************************************
+*  FSE Compression Code
+****************************************************************/
+/*
+FSE_CTable[0] is a variable size structure which contains :
+    U16 tableLog;
+    U16 maxSymbolValue;
+    U16 nextStateNumber[1 << tableLog];                         // This size is variable
+    FSE_symbolCompressionTransform symbolTT[maxSymbolValue+1];  // This size is variable
+Allocation is manual, since C standard does not support variable-size structures.
+*/
+
+size_t FSE_sizeof_CTable (unsigned maxSymbolValue, unsigned tableLog)
+{
+    /* A compilation error here means FSE_CTABLE_SIZE_U32 is not large enough */
+    FSE_STATIC_ASSERT((size_t)FSE_CTABLE_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)*4 >= sizeof(CTable_max_t));
+
+    if (tableLog > FSE_MAX_TABLELOG) return (size_t)-FSE_ERROR_GENERIC;   /* beyond the compiled-in maximum */
+    return FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);   /* FSE_CTABLE_SIZE_U32 counts U32 cells : convert to bytes */
+}
+
+FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog)
+{
+    /* an oversized tableLog is clamped to FSE_TABLELOG_ABSOLUTE_MAX rather than rejected */
+    const unsigned cappedLog = (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) ? FSE_TABLELOG_ABSOLUTE_MAX : tableLog;
+    const size_t nbBytes = FSE_CTABLE_SIZE_U32 (cappedLog, maxSymbolValue) * sizeof(U32);
+    return (FSE_CTable*)malloc(nbBytes);   /* malloc result is returned unchecked : may be NULL */
+}
+
+void  FSE_freeCTable (FSE_CTable* ct)
+{
+    free(ct);   /* plain free() : matches the malloc() in FSE_createCTable; free(NULL) is a no-op */
+}
+
+
+/* FSE_minTableLog : minimum tableLog to safely represent a distribution = min( highbit(srcSize-1)+1 , highbit(maxSymbolValue)+2 ) */
+static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
+{
+    const U32 bitsForSrc = FSE_highbit32((U32)(srcSize - 1)) + 1;      /* candidate derived from the input size */
+    const U32 bitsForSymbols = FSE_highbit32(maxSymbolValue) + 2;    /* candidate derived from the largest symbol value */
+    if (bitsForSrc < bitsForSymbols) return bitsForSrc;
+    return bitsForSymbols;
+}
+
+unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{
+    /* Starts from maxTableLog (0 selects FSE_DEFAULT_TABLELOG), then applies, in this order : source-size ceiling, symbol floor, MIN/MAX limits */
+    U32 const srcCeiling = FSE_highbit32((U32)(srcSize - 1)) - 2;       /* Accuracy can be reduced */
+    U32 const symbolFloor = FSE_minTableLog(srcSize, maxSymbolValue);   /* Need a minimum to safely represent all symbol values */
+    U32 chosen = (maxTableLog == 0) ? FSE_DEFAULT_TABLELOG : maxTableLog;
+    if (chosen > srcCeiling) chosen = srcCeiling;
+    if (chosen < symbolFloor) chosen = symbolFloor;
+    if (chosen < FSE_MIN_TABLELOG) chosen = FSE_MIN_TABLELOG;
+    if (chosen > FSE_MAX_TABLELOG) chosen = FSE_MAX_TABLELOG;
+    return chosen;
+}
+
+
+/* FSE_normalizeM2() : secondary normalization method. To be used when primary method fails.
+   Fills norm[0..maxSymbolValue] from count[] by sharing out (1<<tableLog) table slots (`total` : presumably the sum of count[], as passed by FSE_normalizeCount()).
+   Markers written into norm[] : 0 = absent symbol, -1 = count <= (total >> tableLog), -2 = not assigned yet (internal to this function).
+   @return : 0 on success, or (size_t)-FSE_ERROR_GENERIC when the slots cannot be shared out. */
+static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue)
+{
+    U32 s;
+    U32 distributed = 0;   /* slots already handed out (one per symbol marked -1 or 1) */
+    U32 ToDistribute;      /* slots still available */
+
+    /* Init */
+    U32 lowThreshold = (U32)(total >> tableLog);
+    U32 lowOne = (U32)((total * 3) >> (tableLog + 1));
+
+    /* First pass : symbols at or below the thresholds take exactly one slot; their counts leave `total` */
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        if (count[s] == 0) { norm[s]=0; continue; }
+        if (count[s] <= lowThreshold) { norm[s] = -1; distributed++; total -= count[s]; continue; }
+        if (count[s] <= lowOne) { norm[s] = 1; distributed++; total -= count[s]; continue; }
+        norm[s]=-2;   /* not yet assigned */
+    }
+    ToDistribute = (1 << tableLog) - distributed;
+
+    /* Table already full : nothing left to share, and the division by ToDistribute below would be a division by zero */
+    if (ToDistribute == 0) return 0;
+
+    if ((total / ToDistribute) > lowOne)
+    {
+        /* risk of rounding to zero */
+        lowOne = (U32)((total * 3) / (ToDistribute * 2));
+        for (s=0; s<=maxSymbolValue; s++)
+        {
+            if ((norm[s] == -2) && (count[s] <= lowOne))
+            {
+                norm[s] = 1;
+                distributed++;
+                total -= count[s];
+            }
+        }
+        ToDistribute = (1 << tableLog) - distributed;
+    }
+
+    if (distributed == maxSymbolValue+1)
+    {
+        /* all values are pretty poor;
+           probably incompressible data (should have already been detected);
+           find max, then give all remaining points to max */
+        U32 maxV = 0, maxC =0;
+        for (s=0; s<=maxSymbolValue; s++)
+            if (count[s] > maxC) maxV=s, maxC=count[s];
+        norm[maxV] += (short)ToDistribute;
+        return 0;
+    }
+
+    if (total == 0)
+    {
+        /* every counted symbol already holds a slot, so the rescaling below would divide by zero :
+           deal the leftover slots, round-robin, to the symbols weighted >= 1 */
+        U32 nbPositive = 0;
+        for (s=0; s<=maxSymbolValue; s++) nbPositive += (norm[s] > 0);
+        if ((nbPositive == 0) && (ToDistribute > 0)) return (size_t)-FSE_ERROR_GENERIC;   /* nobody can take them */
+        for (s=0; ToDistribute > 0; s = (s+1)%(maxSymbolValue+1))
+            if (norm[s] > 0) { ToDistribute--; norm[s]++; }
+        return 0;
+    }
+
+    {
+        U64 const vStepLog = 62 - tableLog;
+        U64 const mid = (1ULL << (vStepLog-1)) - 1;
+        U64 const rStep = ((((U64)1<<vStepLog) * ToDistribute) + mid) / total;   /* scale on remaining */
+        U64 tmpTotal = mid;
+        for (s=0; s<=maxSymbolValue; s++)
+        {
+            if (norm[s]==-2)
+            {
+                U64 end = tmpTotal + (count[s] * rStep);
+                U32 sStart = (U32)(tmpTotal >> vStepLog);
+                U32 sEnd = (U32)(end >> vStepLog);
+                U32 weight = sEnd - sStart;
+                if (weight < 1)
+                    return (size_t)-FSE_ERROR_GENERIC;
+                norm[s] = (short)weight;
+                tmpTotal = end;
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
+                           const unsigned* count, size_t total,
+                           unsigned maxSymbolValue)
+{   /* Scales count[0..maxSymbolValue] into normalizedCounter[] over (1<<tableLog) slots. Returns the tableLog used, 0 when one symbol's count equals `total`, or an error code. */
+    /* Sanity checks */
+    if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
+    if (tableLog < FSE_MIN_TABLELOG) return (size_t)-FSE_ERROR_GENERIC;   /* Unsupported size */
+    if (tableLog > FSE_MAX_TABLELOG) return (size_t)-FSE_ERROR_GENERIC;   /* Unsupported size */
+    if (tableLog < FSE_minTableLog(total, maxSymbolValue)) return (size_t)-FSE_ERROR_GENERIC;   /* Too small tableLog, compression potentially impossible */
+
+    {
+        U32 const rtbTable[] = {     0, 473195, 504333, 520860, 550000, 700000, 750000, 830000 };   /* "rest to beat" thresholds, indexed by proba (0..7); scaled by vStep below */
+        U64 const scale = 62 - tableLog;
+        U64 const step = ((U64)1<<62) / total;   /* <== here, one division ! NOTE(review): total==0 is not checked before this division */
+        U64 const vStep = 1ULL<<(scale-20);
+        int stillToDistribute = 1<<tableLog;   /* slots left; goes negative when too many were handed out */
+        unsigned s;
+        unsigned largest=0;   /* symbol holding the largest proba so far */
+        short largestP=0;
+        U32 lowThreshold = (U32)(total >> tableLog);
+
+        for (s=0; s<=maxSymbolValue; s++)
+        {
+            if (count[s] == total) return 0;   /* one symbol accounts for all of `total` : stop here, normalizedCounter[] is not completed */
+            if (count[s] == 0)
+            {
+                normalizedCounter[s]=0;
+                continue;
+            }
+            if (count[s] <= lowThreshold)
+            {
+                normalizedCounter[s] = -1;   /* marker for a low-count symbol; consumes one slot */
+                stillToDistribute--;
+            }
+            else
+            {
+                short proba = (short)((count[s]*step) >> scale);
+                if (proba<8)
+                {
+                    U64 restToBeat = vStep * rtbTable[proba];
+                    proba += (count[s]*step) - ((U64)proba<<scale) > restToBeat;   /* +1 when the remainder exceeds the threshold */
+                }
+                if (proba > largestP)
+                {
+                    largestP=proba;
+                    largest=s;
+                }
+                normalizedCounter[s] = proba;
+                stillToDistribute -= proba;
+            }
+        }
+        if (-stillToDistribute >= (normalizedCounter[largest] >> 1))   /* excess is at least half of the largest weight */
+        {
+            /* corner case, need another normalization method */
+            size_t errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue);
+            if (FSE_isError(errorCode)) return errorCode;
+        }
+        else normalizedCounter[largest] += (short)stillToDistribute;   /* the largest symbol absorbs the difference (positive or negative) */
+    }
+
+#if 0
+    {   /* Print Table (debug) */
+        U32 s;
+        U32 nTotal = 0;
+        for (s=0; s<=maxSymbolValue; s++)
+            printf("%3i: %4i \n", s, normalizedCounter[s]);
+        for (s=0; s<=maxSymbolValue; s++)
+            nTotal += abs(normalizedCounter[s]);
+        if (nTotal != (1U<<tableLog))
+            printf("Warning !!! Total == %u != %u !!!", nTotal, 1U<<tableLog);
+        getchar();
+    }
+#endif
+
+    return tableLog;
+}
+
+
+/* fake FSE_CTable, for raw (uncompressed) input : every symbol in [0, (1<<nbBits)-1] costs exactly nbBits bits. Returns 0, or an error code when nbBits < 1 */
+size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits)
+{
+    const unsigned tableSize = 1 << nbBits;   /* NOTE(review): computed before the nbBits sanity check below; no upper bound on nbBits is checked here */
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSymbolValue = tableMask;
+    U16* tableU16 = ( (U16*) ct) + 2;   /* state table, placed after the two U16 header fields written below */
+    FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) ((((U32*)ct)+1) + (tableSize>>1));   /* placed after the header and the tableSize U16 states (assuming U16/U32 are 2/4 bytes) */
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return (size_t)-FSE_ERROR_GENERIC;             /* min size */
+
+    /* header */
+    tableU16[-2] = (U16) nbBits;
+    tableU16[-1] = (U16) maxSymbolValue;
+
+    /* Build table */
+    for (s=0; s<tableSize; s++)
+        tableU16[s] = (U16)(tableSize + s);
+
+    /* Build Symbol Transformation Table */
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        symbolTT[s].deltaNbBits = nbBits << 16;   /* FSE_encodeSymbol() computes nbBitsOut as (value + deltaNbBits) >> 16 */
+        symbolTT[s].deltaFindState = s-1;
+    }
+
+    return 0;
+}
+
+
+/* fake FSE_CTable, for rle (100% always same symbol) input : tableLog 0, and only symbolTT[symbolValue] is initialized. Always returns 0 */
+size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue)
+{
+    U16* tableU16 = ( (U16*) ct) + 2;   /* state table, placed after the two U16 header fields */
+    FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) ((U32*)ct + 2);   /* same offset FSE_initCState() uses when tableLog == 0 */
+
+    /* header */
+    tableU16[-2] = (U16) 0;
+    tableU16[-1] = (U16) symbolValue;
+
+    /* Build table */
+    tableU16[0] = 0;
+    tableU16[1] = 0;   /* just in case */
+
+    /* Build Symbol Transformation Table */
+    {
+        symbolTT[symbolValue].deltaNbBits = 0;   /* zero bits emitted per symbol */
+        symbolTT[symbolValue].deltaFindState = 0;
+    }
+
+    return 0;
+}
+
+
+size_t FSE_initCStream(FSE_CStream_t* bitC, void* start, size_t maxSize)
+{   /* Resets bitC to write into the maxSize bytes at `start`. Returns 0, or an error code when maxSize < sizeof(bitC->ptr). */
+    if (maxSize < sizeof(bitC->ptr)) return (size_t)-FSE_ERROR_dstSize_tooSmall;
+    bitC->bitContainer = 0;
+    bitC->bitPos = 0;
+    bitC->startPtr = (char*)start;
+    bitC->ptr = bitC->startPtr;
+    bitC->endPtr = bitC->startPtr + maxSize - sizeof(bitC->ptr);   /* FSE_flushBits() clamps ptr to this position */
+    return 0;
+}
+
+void FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct)
+{   /* Points statePtr at the tables stored inside `ct` and sets the starting state value to 1<<tableLog. */
+    const U32 tableLog = ( (const U16*) ct) [0];   /* first U16 of the CTable header */
+    statePtr->value = (ptrdiff_t)1<<tableLog;
+    statePtr->stateTable = ((const U16*) ct) + 2;   /* skips the two U16 header fields */
+    statePtr->symbolTT = (const FSE_symbolCompressionTransform*)((const U32*)ct + 1 + (tableLog ? (1<<(tableLog-1)) : 1));   /* tableLog==0 : offset of 2 U32, matching FSE_buildCTable_rle() */
+    statePtr->stateLog = tableLog;
+}
+
+void FSE_addBitsFast(FSE_CStream_t* bitC, size_t value, unsigned nbBits)   /* only use if upper bits are clean 0 */
+{   /* ORs `value` into the container at bitPos without masking it, then advances bitPos by nbBits. */
+    bitC->bitContainer |= value << bitC->bitPos;
+    bitC->bitPos += nbBits;
+}
+
+void FSE_addBits(FSE_CStream_t* bitC, size_t value, unsigned nbBits)
+{   /* ORs the low nbBits bits of `value` into the container at bitPos, then advances bitPos by nbBits. */
+    static const unsigned mask[] = { 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF, 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF,  0xFFFFFF, 0x1FFFFFF };   /* up to 25 bits */
+    bitC->bitContainer |= (value & mask[nbBits]) << bitC->bitPos;   /* NOTE(review): nbBits is not range-checked against the 26-entry mask[] */
+    bitC->bitPos += nbBits;
+}
+
+void FSE_encodeSymbol(FSE_CStream_t* bitC, FSE_CState_t* statePtr, U32 symbol)
+{   /* Encodes one symbol : emits the low bits of the current state into bitC, then moves the state through stateTable. */
+    const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+    const U16* const stateTable = (const U16*)(statePtr->stateTable);
+    U32 nbBitsOut  = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);   /* number of bits to emit for this (state, symbol) pair */
+    FSE_addBits(bitC, statePtr->value, nbBitsOut);   /* FSE_addBits() masks value down to nbBitsOut bits */
+    statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];   /* next state */
+}
+
+void FSE_flushBitsFast(FSE_CStream_t* bitC)  /* only if dst buffer is large enough ( >= FSE_compressBound()) */
+{   /* Writes the container at ptr, advances ptr by the number of complete bytes, keeps the 0..7 leftover bits. No bound check on ptr. */
+    size_t nbBytes = bitC->bitPos >> 3;   /* complete bytes currently held */
+    FSE_writeLEST(bitC->ptr, bitC->bitContainer);
+    bitC->ptr += nbBytes;
+    bitC->bitPos &= 7;
+    bitC->bitContainer >>= nbBytes*8;   /* drop the bytes just accounted for */
+}
+
+void FSE_flushBits(FSE_CStream_t* bitC)
+{   /* Same steps as FSE_flushBitsFast(), plus a clamp of ptr to endPtr. */
+    size_t nbBytes = bitC->bitPos >> 3;   /* complete bytes currently held */
+    FSE_writeLEST(bitC->ptr, bitC->bitContainer);
+    bitC->ptr += nbBytes;
+    if (bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr;   /* never advance past endPtr; FSE_closeCStream() returns 0 when ptr >= endPtr */
+    bitC->bitPos &= 7;
+    bitC->bitContainer >>= nbBytes*8;
+}
+
+void FSE_flushCState(FSE_CStream_t* bitC, const FSE_CState_t* statePtr)
+{   /* Writes the final state value on stateLog bits, then flushes with the bound-checked FSE_flushBits(). */
+    FSE_addBits(bitC, statePtr->value, statePtr->stateLog);
+    FSE_flushBits(bitC);
+}
+
+
+size_t FSE_closeCStream(FSE_CStream_t* bitC)
+{   /* Appends a final 1 bit, flushes, and returns the number of bytes written since startPtr, or 0 when ptr reached endPtr. */
+    char* endPtr;
+
+    FSE_addBitsFast(bitC, 1, 1);   /* FSE_initDStream() rejects a stream whose last byte is 0 ("stop bit not present") */
+    FSE_flushBits(bitC);
+
+    if (bitC->ptr >= bitC->endPtr)   /* too close to buffer's end */
+        return 0;   /* not compressible */
+
+    endPtr = bitC->ptr;
+    endPtr += bitC->bitPos > 0;   /* count the last, partially filled byte */
+
+    return (endPtr - bitC->startPtr);
+}
+
+
+static size_t FSE_compress_usingCTable_generic (void* dst, size_t dstSize,
+                           const void* src, size_t srcSize,
+                           const FSE_CTable* ct, const unsigned fast)
+{   /* Encodes src backwards (from its last byte to its first) with two alternating states. Returns the compressed size, or 0 on failure. */
+    const BYTE* const istart = (const BYTE*) src;
+    const BYTE* ip;
+    const BYTE* const iend = istart + srcSize;
+
+    size_t errorCode;
+    FSE_CStream_t bitC;
+    FSE_CState_t CState1, CState2;
+
+
+    /* init */
+    errorCode = FSE_initCStream(&bitC, dst, dstSize);
+    if (FSE_isError(errorCode)) return 0;   /* dst too small is reported as 0, not as an error code */
+    FSE_initCState(&CState1, ct);
+    CState2 = CState1;
+
+    ip=iend;
+
+#define FSE_FLUSHBITS(s)  (fast ? FSE_flushBitsFast(s) : FSE_flushBits(s))   /* `fast` selects the flush without bound check */
+
+    /* join to even */
+    if (srcSize & 1)
+    {
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        FSE_FLUSHBITS(&bitC);
+    }
+
+    /* join to mod 4 */
+    if ((sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2))   /* test bit 2 */
+    {
+        FSE_encodeSymbol(&bitC, &CState2, *--ip);
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        FSE_FLUSHBITS(&bitC);
+    }
+
+    /* 2 or 4 encoding per loop */
+    for ( ; ip>istart ; )
+    {
+        FSE_encodeSymbol(&bitC, &CState2, *--ip);
+
+        if (sizeof(bitC.bitContainer)*8 < FSE_MAX_TABLELOG*2+7 )   /* this test must be static */
+            FSE_FLUSHBITS(&bitC);
+
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+
+        if (sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 )   /* this test must be static */
+        {
+            FSE_encodeSymbol(&bitC, &CState2, *--ip);
+            FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        }
+
+        FSE_FLUSHBITS(&bitC);
+    }
+
+    FSE_flushCState(&bitC, &CState2);   /* final states are written last : CState2, then CState1 */
+    FSE_flushCState(&bitC, &CState1);
+    return FSE_closeCStream(&bitC);
+}
+
+size_t FSE_compress_usingCTable (void* dst, size_t dstSize,
+                           const void* src, size_t srcSize,
+                           const FSE_CTable* ct)
+{
+    /* Compresses src into dst with an already built CTable.
+       The unchecked ("fast") flush is selected only when dst holds at least FSE_BLOCKBOUND(srcSize) bytes; each call passes a literal constant. */
+    if (dstSize < FSE_BLOCKBOUND(srcSize))
+        return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 0);
+
+    return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 1);
+}
+
+
+size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }   /* function form of the FSE_COMPRESSBOUND() macro */
+
+size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog)
+{   /* Full pipeline : count symbols, normalize, write the table header, build the CTable, compress. Returns the compressed size, 0 (not compressible), 1 (single repeated symbol), or an error code. */
+    const BYTE* const istart = (const BYTE*) src;
+    const BYTE* ip = istart;
+
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + dstSize;
+
+    U32   count[FSE_MAX_SYMBOL_VALUE+1];
+    S16   norm[FSE_MAX_SYMBOL_VALUE+1];
+    CTable_max_t ct;
+    size_t errorCode;
+
+    /* init conditions */
+    if (srcSize <= 1) return 0;  /* Uncompressible */
+    if (!maxSymbolValue) maxSymbolValue = FSE_MAX_SYMBOL_VALUE;   /* 0 selects the default */
+    if (!tableLog) tableLog = FSE_DEFAULT_TABLELOG;   /* 0 selects the default */
+
+    /* Scan input and build symbol stats */
+    errorCode = FSE_count (count, &maxSymbolValue, ip, srcSize);
+    if (FSE_isError(errorCode)) return errorCode;
+    if (errorCode == srcSize) return 1;   /* FSE_count() result equals srcSize : presumably a single symbol fills the input - TODO confirm against FSE_count() */
+    if (errorCode < (srcSize >> 7)) return 0;   /* Heuristic : not compressible enough */
+
+    tableLog = FSE_optimalTableLog(tableLog, srcSize, maxSymbolValue);
+    errorCode = FSE_normalizeCount (norm, tableLog, count, srcSize, maxSymbolValue);
+    if (FSE_isError(errorCode)) return errorCode;
+
+    /* Write table description header */
+    errorCode = FSE_writeNCount (op, oend-op, norm, maxSymbolValue, tableLog);
+    if (FSE_isError(errorCode)) return errorCode;
+    op += errorCode;   /* on success, the value is used as the header size */
+
+    /* Compress */
+    errorCode = FSE_buildCTable (ct, norm, maxSymbolValue, tableLog);
+    if (FSE_isError(errorCode)) return errorCode;
+    errorCode = FSE_compress_usingCTable(op, oend - op, ip, srcSize, ct);
+    if (errorCode == 0) return 0;   /* not enough space for compressed data */
+    op += errorCode;
+
+    /* check compressibility */
+    if ( (size_t)(op-ostart) >= srcSize-1 )   /* header + payload must be smaller than srcSize-1 */
+        return 0;
+
+    return op-ostart;
+}
+
+size_t FSE_compress (void* dst, size_t dstSize, const void* src, size_t srcSize)
+{   /* FSE_compress2() with FSE_MAX_SYMBOL_VALUE and FSE_DEFAULT_TABLELOG. NOTE(review): srcSize is narrowed to U32 although the callee's parameter is size_t. */
+    return FSE_compress2(dst, dstSize, src, (U32)srcSize, FSE_MAX_SYMBOL_VALUE, FSE_DEFAULT_TABLELOG);
+}
+
+
+/*********************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
+{   /* Builds a one-cell decoding table that always yields symbolValue and reads 0 bits. Always returns 0. */
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)dt;
+    FSE_decode_t* const cell = (FSE_decode_t*)(dt + 1);   /* because dt is unsigned */
+
+    DTableH->tableLog = 0;
+    DTableH->fastMode = 0;
+
+    cell->newState = 0;
+    cell->symbol = symbolValue;
+    cell->nbBits = 0;
+
+    return 0;
+}
+
+
+size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
+{   /* Builds a decoding table where cell s yields symbol s and reads nbBits bits. Returns 0, or an error code when nbBits < 1. */
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)dt;
+    FSE_decode_t* const dinfo = (FSE_decode_t*)(dt + 1);   /* because dt is unsigned */
+    const unsigned tableSize = 1 << nbBits;   /* NOTE(review): computed before the nbBits sanity check below; no upper bound on nbBits is checked here */
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSymbolValue = tableMask;
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return (size_t)-FSE_ERROR_GENERIC;             /* min size */
+
+    /* Build Decoding Table */
+    DTableH->tableLog = (U16)nbBits;
+    DTableH->fastMode = 1;   /* every cell reads nbBits >= 1 bits, as FSE_readBitsFast() requires */
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        dinfo[s].newState = 0;   /* next state is then just the nbBits bits read from the stream */
+        dinfo[s].symbol = (BYTE)s;
+        dinfo[s].nbBits = (BYTE)nbBits;
+    }
+
+    return 0;
+}
+
+
+/* FSE_initDStream
+ * Initialize a FSE_DStream_t.
+ * srcBuffer must point at the beginning of an FSE block.
+ * The function result is the size of the FSE_block (== srcSize).
+ * If srcSize is too small, the function will return an errorCode;
+ */
+size_t FSE_initDStream(FSE_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{   /* The stream is read from its end : the container is loaded from the last bytes, and the highest set bit of the last byte is the stop bit. */
+    if (srcSize < 1) return (size_t)-FSE_ERROR_srcSize_wrong;
+
+    if (srcSize >=  sizeof(size_t))
+    {   /* enough input to fill a whole container */
+        U32 contain32;
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(size_t);   /* last sizeof(size_t) bytes */
+        bitD->bitContainer = FSE_readLEST(bitD->ptr);
+        contain32 = ((const BYTE*)srcBuffer)[srcSize-1];   /* last byte of the block */
+        if (contain32 == 0) return (size_t)-FSE_ERROR_GENERIC;   /* stop bit not present */
+        bitD->bitsConsumed = 8 - FSE_highbit32(contain32);   /* skip the padding zeros and the stop bit itself */
+    }
+    else
+    {   /* short input : assemble the container byte by byte */
+        U32 contain32;
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = bitD->start;
+        bitD->bitContainer = *(const BYTE*)(bitD->start);
+        switch(srcSize)   /* every case falls through to the next one on purpose */
+        {
+            case 7: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[6]) << (sizeof(size_t)*8 - 16);
+            case 6: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[5]) << (sizeof(size_t)*8 - 24);
+            case 5: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[4]) << (sizeof(size_t)*8 - 32);
+            case 4: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[3]) << 24;
+            case 3: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[2]) << 16;
+            case 2: bitD->bitContainer += (size_t)(((const BYTE*)(bitD->start))[1]) <<  8;
+            default:;
+        }
+        contain32 = ((const BYTE*)srcBuffer)[srcSize-1];   /* last byte of the block */
+        if (contain32 == 0) return (size_t)-FSE_ERROR_GENERIC;   /* stop bit not present */
+        bitD->bitsConsumed = 8 - FSE_highbit32(contain32);
+        bitD->bitsConsumed += (U32)(sizeof(size_t) - srcSize)*8;   /* the missing high bytes of the container count as already consumed */
+    }
+
+    return srcSize;
+}
+
+
+/* FSE_lookBits
+ * Provides next n bits from the bitContainer.
+ * bitContainer is not modified (bits are still present for next read/look)
+ * On 32-bits, maxNbBits==25
+ * On 64-bits, maxNbBits==57
+ * return : value extracted.
+ */
+static size_t FSE_lookBits(FSE_DStream_t* bitD, U32 nbBits)
+{   /* The right shift is split in two (>> 1, then >> width-1-nbBits) so that nbBits==0 never produces a shift by the full container width. */
+    return ((bitD->bitContainer << (bitD->bitsConsumed & ((sizeof(bitD->bitContainer)*8)-1))) >> 1) >> (((sizeof(bitD->bitContainer)*8)-1)-nbBits);
+}
+
+static size_t FSE_lookBitsFast(FSE_DStream_t* bitD, U32 nbBits)   /* only if nbBits >= 1 !! */
+{   /* Single-shift variant of FSE_lookBits() : with nbBits==0 the right shift would equal the container width. bitsConsumed is not masked here. */
+    return (bitD->bitContainer << bitD->bitsConsumed) >> ((sizeof(bitD->bitContainer)*8)-nbBits);
+}
+
+static void FSE_skipBits(FSE_DStream_t* bitD, U32 nbBits)
+{   /* Marks nbBits more bits as consumed; the container itself is left untouched. */
+    bitD->bitsConsumed += nbBits;
+}
+
+
+/* FSE_readBits
+ * Read next n bits from the bitContainer.
+ * On 32-bits, don't read more than maxNbBits==25
+ * On 64-bits, don't read more than maxNbBits==57
+ * Use the fast variant *only* if n >= 1.
+ * return : value extracted.
+ */
+size_t FSE_readBits(FSE_DStream_t* bitD, U32 nbBits)
+{   /* look, then skip : the container is not reloaded here (see FSE_reloadDStream()) */
+    size_t value = FSE_lookBits(bitD, nbBits);
+    FSE_skipBits(bitD, nbBits);
+    return value;
+}
+
+size_t FSE_readBitsFast(FSE_DStream_t* bitD, U32 nbBits)   /* only if nbBits >= 1 !! */
+{   /* Same as FSE_readBits(), built on FSE_lookBitsFast(). */
+    size_t value = FSE_lookBitsFast(bitD, nbBits);
+    FSE_skipBits(bitD, nbBits);
+    return value;
+}
+
+unsigned FSE_reloadDStream(FSE_DStream_t* bitD)
+{   /* Refills the container by moving ptr back over the bytes already consumed; returns one of the FSE_DStream_* status values. */
+	if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should never happen */
+		return FSE_DStream_tooFar;
+
+    if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer))
+    {   /* far enough from the start : a plain refill, which cannot move ptr before start */
+        bitD->ptr -= bitD->bitsConsumed >> 3;
+        bitD->bitsConsumed &= 7;
+        bitD->bitContainer = FSE_readLEST(bitD->ptr);
+        return FSE_DStream_unfinished;
+    }
+    if (bitD->ptr == bitD->start)
+    {   /* already at the start : nothing left to load */
+        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return FSE_DStream_endOfBuffer;   /* some bits of the container are still unread */
+        return FSE_DStream_completed;   /* every bit has been consumed */
+    }
+    {   /* close to the start : move back by at most (ptr - start) bytes */
+        U32 nbBytes = bitD->bitsConsumed >> 3;
+        U32 result = FSE_DStream_unfinished;
+        if (bitD->ptr - nbBytes < bitD->start)
+        {
+            nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+            result = FSE_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;
+        bitD->bitContainer = FSE_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD) */
+        return result;
+    }
+}
+
+
+void FSE_initDState(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD, const FSE_DTable* dt)
+{   /* Reads the starting state (tableLog bits) from the stream, refills the stream, and points the state at the decoding cells. */
+    const FSE_DTableHeader* const DTableH = (const FSE_DTableHeader*)dt;
+    DStatePtr->state = FSE_readBits(bitD, DTableH->tableLog);
+    FSE_reloadDStream(bitD);   /* returned status is ignored here */
+    DStatePtr->table = dt + 1;   /* cells start right after the header element */
+}
+
+BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD)
+{   /* Returns the symbol attached to the current state, then moves to the next state using nbBits bits read from the stream. */
+    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    const U32  nbBits = DInfo.nbBits;
+    BYTE symbol = DInfo.symbol;
+    size_t lowBits = FSE_readBits(bitD, nbBits);   /* this variant accepts nbBits == 0 */
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD)
+{   /* Same as FSE_decodeSymbol(), but reads with FSE_readBitsFast(), which is only valid when nbBits >= 1. */
+    const FSE_decode_t DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    const U32 nbBits = DInfo.nbBits;
+    BYTE symbol = DInfo.symbol;
+    size_t lowBits = FSE_readBitsFast(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+/* FSE_endOfDStream
+   Tells if bitD has reached end of bitStream or not */
+
+unsigned FSE_endOfDStream(const FSE_DStream_t* bitD)
+{   /* true only when ptr is back at start AND every bit of the container has been consumed */
+    return ((bitD->ptr == bitD->start) && (bitD->bitsConsumed == sizeof(bitD->bitContainer)*8));
+}
+
+unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
+{   /* true when the state value is 0; FSE_decompress_usingDTable_generic() requires this of both states to accept the stream */
+    return DStatePtr->state == 0;
+}
+
+
+FORCE_INLINE size_t FSE_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSE_DTable* dt, const unsigned fast)
+{   /* Decodes cSrc into dst with two alternating states. Returns the number of bytes written, or an error code. */
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;   /* the 4-symbol loop below only runs while op < olimit */
+
+    FSE_DStream_t bitD;
+    FSE_DState_t state1;
+    FSE_DState_t state2;
+    size_t errorCode;
+
+    /* Init */
+    errorCode = FSE_initDStream(&bitD, cSrc, cSrcSize);   /* replaced last arg by maxCompressed Size */
+    if (FSE_isError(errorCode)) return errorCode;
+
+    FSE_initDState(&state1, &bitD, dt);
+    FSE_initDState(&state2, &bitD, dt);
+
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (FSE_reloadDStream(&bitD)==FSE_DStream_unfinished) && (op<olimit) ; op+=4)
+    {
+        op[0] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            FSE_reloadDStream(&bitD);
+
+        op[1] = FSE_GETSYMBOL(&state2);
+
+        if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (FSE_reloadDStream(&bitD) > FSE_DStream_unfinished) { op+=2; break; } }
+
+        op[2] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            FSE_reloadDStream(&bitD);
+
+        op[3] = FSE_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : FSE_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly FSE_DStream_completed. NOTE(review): FSE_DStream_partiallyFilled is not used anywhere else in this section (statuses seen : unfinished, endOfBuffer, completed, tooFar) */
+    while (1)
+    {
+        if ( (FSE_reloadDStream(&bitD)>FSE_DStream_completed) || (op==omax) || (FSE_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state1))) )
+            break;
+
+        *op++ = FSE_GETSYMBOL(&state1);
+
+        if ( (FSE_reloadDStream(&bitD)>FSE_DStream_completed) || (op==omax) || (FSE_endOfDStream(&bitD) && (fast || FSE_endOfDState(&state2))) )
+            break;
+
+        *op++ = FSE_GETSYMBOL(&state2);
+    }
+
+    /* end ? */
+    if (FSE_endOfDStream(&bitD) && FSE_endOfDState(&state1) && FSE_endOfDState(&state2))   /* clean finish : stream fully consumed and both states at 0 */
+        return op-ostart;
+
+    if (op==omax) return (size_t)-FSE_ERROR_dstSize_tooSmall;   /* dst buffer is full, but cSrc unfinished */
+
+    return (size_t)-FSE_ERROR_corruptionDetected;
+}
+
+
+size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSE_DTable* dt)
+{
+    /* Decodes cSrc with an already built DTable; the table header tells which decoding variant applies */
+    const FSE_DTableHeader* const header = (const FSE_DTableHeader*)dt;
+    /* select fast mode (static) : each call passes a literal constant as last argument */
+    if (!header->fastMode)
+        return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+}
+
+
+size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
+{   /* Reads the normalized-count header, builds the DTable, then decodes the rest of cSrc. Returns the decoded size, or an error code. */
+    const BYTE* const istart = (const BYTE*)cSrc;
+    const BYTE* ip = istart;
+    short counting[FSE_MAX_SYMBOL_VALUE+1];
+    DTable_max_t dt;   /* Static analyzer seems unable to understand this table will be properly initialized later */
+    unsigned tableLog;
+    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    size_t errorCode;
+
+    if (cSrcSize<2) return (size_t)-FSE_ERROR_srcSize_wrong;   /* too small input size */
+
+    /* normal FSE decoding mode */
+    errorCode = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+    if (FSE_isError(errorCode)) return errorCode;
+    if (errorCode >= cSrcSize) return (size_t)-FSE_ERROR_srcSize_wrong;   /* too small input size */
+    ip += errorCode;   /* on success, the value is used as the header size */
+    cSrcSize -= errorCode;
+
+    errorCode = FSE_buildDTable (dt, counting, maxSymbolValue, tableLog);
+    if (FSE_isError(errorCode)) return errorCode;
+
+    /* always return, even if it is an error code */
+    return FSE_decompress_usingDTable (dst, maxDstSize, ip, cSrcSize, dt);
+}
+
+
+
+/*********************************************************
+*  Huff0 : Huffman block compression
+*********************************************************/
+#define HUF_MAX_SYMBOL_VALUE 255
+#define HUF_DEFAULT_TABLELOG  12       /* used by default, when not specified */
+#define HUF_MAX_TABLELOG  12           /* max possible tableLog; for allocation purpose; can be modified */
+#define HUF_ABSOLUTEMAX_TABLELOG  16   /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
+#if (HUF_MAX_TABLELOG > HUF_ABSOLUTEMAX_TABLELOG)
+#  error "HUF_MAX_TABLELOG is too large !"
+#endif
+
+typedef struct HUF_CElt_s {   /* one entry per symbol of the Huffman compression table */
+  U16  val;      /* presumably the code value of the symbol - TODO confirm, not used in this section */
+  BYTE nbBits;   /* read by HUF_writeCTable() and converted to a weight */
+} HUF_CElt ;
+
+typedef struct nodeElt_s {   /* node used while building / adjusting the Huffman tree */
+    U32 count;     /* compared by HUF_setMaxHeight() and HUF_sort() */
+    U16 parent;    /* presumably the index of the parent node - TODO confirm, not used in this section */
+    BYTE byte;     /* presumably the symbol value - TODO confirm, not used in this section */
+    BYTE nbBits;   /* code length; capped by HUF_setMaxHeight() */
+} nodeElt;
+
+/* HUF_writeCTable() : saves the bit lengths of tree[0..maxSymbolValue-1] into dst, as weights (FSE-compressed, RLE code, or raw 4-bit pairs).
+   return : size of saved CTable, or an error code (dst too small, or a table this format cannot describe) */
+size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* tree, U32 maxSymbolValue, U32 huffLog)
+{
+    BYTE bitsToWeight[HUF_ABSOLUTEMAX_TABLELOG + 1];
+    BYTE huffWeight[HUF_MAX_SYMBOL_VALUE + 1];
+    U32 n;
+    BYTE* op = (BYTE*)dst;
+    size_t size;
+
+    /* check conditions */
+    if (maxSymbolValue > HUF_MAX_SYMBOL_VALUE + 1) return (size_t)-FSE_ERROR_GENERIC;
+    if (maxDstSize < 1) return (size_t)-FSE_ERROR_dstSize_tooSmall;   /* op[0] is always written, and maxDstSize-1 below must not wrap around */
+
+    /* convert to weight */
+    bitsToWeight[0] = 0;
+    for (n=1; n<=huffLog; n++)
+        bitsToWeight[n] = (BYTE)(huffLog + 1 - n);   /* nbBits n => weight huffLog+1-n; nbBits 0 => weight 0 */
+    for (n=0; n<maxSymbolValue; n++)
+        huffWeight[n] = bitsToWeight[tree[n].nbBits];
+
+    size = FSE_compress(op+1, maxDstSize-1, huffWeight, maxSymbolValue);   /* don't need last symbol stat : implied */
+    if (FSE_isError(size)) return size;
+    if (size >= 128) return (size_t)-FSE_ERROR_GENERIC;   /* should never happen, since maxSymbolValue <= 255 */
+    if ((size <= 1) || (size >= maxSymbolValue/2))   /* FSE gave up (0), found a single value (1), or did not beat the raw 4-bit format */
+    {
+        if (size==1)   /* RLE */
+        {
+            /* only possible case : serie of 1 (because there are at least 2) */
+            /* can only be 2^n or (2^n-1), otherwise not an huffman tree */
+            BYTE code;
+            switch(maxSymbolValue)
+            {
+            case 1: code = 0; break;
+            case 2: code = 1; break;
+            case 3: code = 2; break;
+            case 4: code = 3; break;
+            case 7: code = 4; break;
+            case 8: code = 5; break;
+            case 15: code = 6; break;
+            case 16: code = 7; break;
+            case 31: code = 8; break;
+            case 32: code = 9; break;
+            case 63: code = 10; break;
+            case 64: code = 11; break;
+            case 127: code = 12; break;
+            case 128: code = 13; break;
+            default : return (size_t)-FSE_ERROR_corruptionDetected;
+            }
+            op[0] = (BYTE)(255-13 + code);   /* header bytes 242..255 carry the RLE codes */
+            return 1;
+        }
+         /* Not compressible */
+        if (maxSymbolValue > (241-128)) return (size_t)-FSE_ERROR_GENERIC;   /* not implemented (not possible with current format) */
+        if (((maxSymbolValue+1)/2) + 1 > maxDstSize) return (size_t)-FSE_ERROR_dstSize_tooSmall;   /* not enough space within dst buffer */
+        op[0] = (BYTE)(128 /*special case*/ + 0 /* Not Compressible */ + (maxSymbolValue-1));
+        huffWeight[maxSymbolValue] = 0;   /* to be sure it doesn't cause issue in final combination */
+        for (n=0; n<maxSymbolValue; n+=2)
+            op[(n/2)+1] = (BYTE)((huffWeight[n] << 4) + huffWeight[n+1]);   /* two 4-bit weights per byte */
+        return ((maxSymbolValue+1)/2) + 1;
+    }
+
+    /* normal header case */
+    op[0] = (BYTE)size;   /* < 128 : size of the FSE-compressed weights that follow */
+    return size+1;
+}
+
+
+static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
+{   /* Caps the code lengths of huffNode[0..lastNonNull] at maxNbBits. Returns the largest length in use : huffNode[lastNonNull].nbBits if already within the cap, else maxNbBits. */
+    int totalCost = 0;   /* debt created by shortening codes; repaid below by lengthening other codes */
+    const U32 largestBits = huffNode[lastNonNull].nbBits;
+
+    /* early exit : all is fine */
+    if (largestBits <= maxNbBits) return largestBits;
+
+    // now we have a few too large elements (at least >= 2)
+    {
+        const U32 baseCost = 1 << (largestBits - maxNbBits);
+        U32 n = lastNonNull;
+
+        while (huffNode[n].nbBits > maxNbBits)   /* walk down from lastNonNull, truncating every code that is too long */
+        {
+            totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits));
+            huffNode[n].nbBits = (BYTE)maxNbBits;
+            n --;
+        }
+
+        /* renorm totalCost */
+        totalCost >>= (largestBits - maxNbBits);  /* note : totalCost necessarily multiple of baseCost */
+
+        // repay cost
+        while (huffNode[n].nbBits == maxNbBits) n--;   // n at last of rank (maxNbBits-1)
+
+        {
+            const U32 noOne = 0xF0F0F0F0;   /* "empty rank" marker; same byte pattern as the memset() below */
+            // Get pos of last (smallest) symbol per rank
+            U32 rankLast[HUF_MAX_TABLELOG];   /* indexed by (maxNbBits - nbBits). NOTE(review): no bound check is visible on the indexes used below */
+            U32 currentNbBits = maxNbBits;
+            int pos;
+			memset(rankLast, 0xF0, sizeof(rankLast));
+            for (pos=n ; pos >= 0; pos--)
+            {
+                if (huffNode[pos].nbBits >= currentNbBits) continue;
+                currentNbBits = huffNode[pos].nbBits;
+                rankLast[maxNbBits-currentNbBits] = pos;
+            }
+
+            while (totalCost > 0)
+            {
+                U32 nBitsToDecrease = FSE_highbit32(totalCost) + 1;   /* largest rank whose repayment does not exceed the debt */
+                for ( ; nBitsToDecrease > 1; nBitsToDecrease--)
+                {
+                    U32 highPos = rankLast[nBitsToDecrease];
+                    U32 lowPos = rankLast[nBitsToDecrease-1];
+                    if (highPos == noOne) continue;
+                    if (lowPos == noOne) break;
+                    {
+                        U32 highTotal = huffNode[highPos].count;
+                        U32 lowTotal = 2 * huffNode[lowPos].count;
+                        if (highTotal <= lowTotal) break;   /* keep this rank : its last symbol counts no more than twice the lower rank's */
+                    }
+                }
+                while (rankLast[nBitsToDecrease] == noOne)
+                    nBitsToDecrease ++;   // In some rare cases, no more rank 1 left => overshoot to closest
+                totalCost -= 1 << (nBitsToDecrease-1);
+                if (rankLast[nBitsToDecrease-1] == noOne)
+                    rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease];   // now there is one elt
+                huffNode[rankLast[nBitsToDecrease]].nbBits ++;
+                rankLast[nBitsToDecrease]--;
+				if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease)
+                    rankLast[nBitsToDecrease] = noOne;   // rank list emptied
+            }
+			while (totalCost < 0)   // Sometimes, cost correction overshoot
+			{
+				if (rankLast[1] == noOne)   /* special case, no weight 1, let's find it back at n */
+				{
+					while (huffNode[n].nbBits == maxNbBits) n--;
+					huffNode[n+1].nbBits--;
+					rankLast[1] = n+1;
+					totalCost++;
+					continue;
+				}
+				huffNode[ rankLast[1] + 1 ].nbBits--;
+				rankLast[1]++;
+				totalCost ++;
+			}
+        }
+    }
+
+    return maxNbBits;
+}
+
+
+/* Bucket bookkeeping used by HUF_sort() :
+ * 'base' is the first slot of the bucket, 'current' the next free slot. */
+typedef struct {
+    U32 base;
+    U32 current;
+} rankPos;
+
+/* HUF_sort() :
+ * Fills huffNode[0..maxSymbolValue] with the symbols 0..maxSymbolValue and their counts,
+ * ordered by decreasing count.
+ * Symbols are first dispatched into buckets selected by FSE_highbit32(count+1),
+ * then placed inside their bucket by insertion.
+ * Only the 'count' and 'byte' fields of the destination cell are written explicitly.
+ */
+static void HUF_sort(nodeElt* huffNode, const U32* count, U32 maxSymbolValue)
+{
+    rankPos rank[32];
+    U32 s;
+
+    memset(rank, 0, sizeof(rank));
+
+    /* population of each bucket */
+    for (s=0; s<=maxSymbolValue; s++)
+        rank[FSE_highbit32(count[s] + 1)].base++;
+
+    /* accumulate from the top, so that larger counts get the lower positions */
+    s = 30;
+    while (s > 0)
+    {
+        rank[s-1].base += rank[s].base;
+        s--;
+    }
+
+    for (s=0; s<32; s++)
+        rank[s].current = rank[s].base;
+
+    /* insert each symbol into its bucket, keeping the bucket in decreasing count order */
+    for (s=0; s<=maxSymbolValue; s++)
+    {
+        const U32 freq = count[s];
+        rankPos* const bucket = &rank[FSE_highbit32(freq+1) + 1];
+        U32 slot = bucket->current++;
+
+        while (slot > bucket->base)
+        {
+            if (freq <= huffNode[slot-1].count) break;
+            huffNode[slot] = huffNode[slot-1];
+            slot--;
+        }
+        huffNode[slot].count = freq;
+        huffNode[slot].byte  = (BYTE)s;
+    }
+}
+
+
+/* index of the first internal (parent) node within huffNode[] ; leaves occupy the indexes below it */
+#define STARTNODE (HUF_MAX_SYMBOL_VALUE+1)
+/* HUF_buildCTable() :
+ * Builds a Huffman compression table from symbol statistics.
+ * tree           : destination ; cells [0..maxSymbolValue] receive 'nbBits' and 'val' for each symbol
+ * count          : number of occurrences of each symbol, [0..maxSymbolValue]
+ * maxSymbolValue : must be <= HUF_MAX_SYMBOL_VALUE
+ * maxNbBits      : maximum code length ; 0 means HUF_DEFAULT_TABLELOG
+ * @return : the maximum code length actually used,
+ *           or (size_t)-FSE_ERROR_GENERIC when maxSymbolValue or the resulting length is too large.
+ * NOTE(review) : the construction reads huffNode[lowS-1] right after locating the last
+ *                non-null symbol ; presumably callers guarantee at least 2 symbols with
+ *                a non-zero count -- to be confirmed.
+ */
+size_t HUF_buildCTable (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits)
+{
+    nodeElt huffNode0[2*HUF_MAX_SYMBOL_VALUE+1 +1];
+    nodeElt* huffNode = huffNode0 + 1;   /* huffNode[-1] is a valid cell, used as sentinel below */
+    U32 n, nonNullRank;
+    int lowS, lowN;
+    U16 nodeNb = STARTNODE;
+    U32 nodeRoot;
+
+    /* safety checks */
+    if (maxNbBits == 0) maxNbBits = HUF_DEFAULT_TABLELOG;
+    if (maxSymbolValue > HUF_MAX_SYMBOL_VALUE) return (size_t)-FSE_ERROR_GENERIC;
+    memset(huffNode0, 0, sizeof(huffNode0));
+
+    // sort, decreasing order
+    HUF_sort(huffNode, count, maxSymbolValue);
+
+    // init for parents
+    nonNullRank = maxSymbolValue;
+    while(huffNode[nonNullRank].count == 0) nonNullRank--;
+    lowS = nonNullRank; nodeRoot = nodeNb + lowS - 1; lowN = nodeNb;
+    huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count;
+    huffNode[lowS].parent = huffNode[lowS-1].parent = nodeNb;
+    nodeNb++; lowS-=2;
+    /* not-yet-created parents get a large count, so they are not selected before being built */
+    for (n=nodeNb; n<=nodeRoot; n++) huffNode[n].count = (U32)(1U<<30);
+    /* sentinel at huffNode[-1] : larger than any parent, reached once lowS runs below 0 */
+    huffNode0[0].count = (U32)(1U<<31);
+
+    // create parents
+    while (nodeNb <= nodeRoot)
+    {
+        /* merge the two smallest remaining nodes, taken from the leaves (lowS) or the parents (lowN) */
+        U32 n1 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
+        U32 n2 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
+        huffNode[nodeNb].count = huffNode[n1].count + huffNode[n2].count;
+        huffNode[n1].parent = huffNode[n2].parent = nodeNb;
+        nodeNb++;
+    }
+
+    // distribute weights (unlimited tree height)
+    huffNode[nodeRoot].nbBits = 0;
+    for (n=nodeRoot-1; n>=STARTNODE; n--)
+        huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+    for (n=0; n<=nonNullRank; n++)
+        huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+
+    // enforce maxTableLog
+    maxNbBits = HUF_setMaxHeight(huffNode, nonNullRank, maxNbBits);
+
+    // fill result into tree (val, nbBits)
+    {
+        U16 nbPerRank[HUF_ABSOLUTEMAX_TABLELOG+1] = {0};
+        /* zero-initialized : cell [0] is read and incremented for every symbol with nbBits==0,
+         * but is never assigned by the loop below (which stops at rank 1) */
+        U16 valPerRank[HUF_ABSOLUTEMAX_TABLELOG+1] = {0};
+        if (maxNbBits > HUF_ABSOLUTEMAX_TABLELOG) return (size_t)-FSE_ERROR_GENERIC;   // check
+        for (n=0; n<=nonNullRank; n++)
+            nbPerRank[huffNode[n].nbBits]++;
+        {
+            // determine starting value per rank
+            U16 min = 0;
+            for (n=maxNbBits; n>0; n--)
+            {
+                valPerRank[n] = min;      // get starting value within each rank
+                min += nbPerRank[n];
+                min >>= 1;
+            }
+        }
+        for (n=0; n<=maxSymbolValue; n++)
+            tree[huffNode[n].byte].nbBits = huffNode[n].nbBits;   // push nbBits per symbol, symbol order
+        for (n=0; n<=maxSymbolValue; n++)
+            tree[n].val = valPerRank[tree[n].nbBits]++;   // assign value within rank, symbol order
+    }
+
+    return maxNbBits;
+}
+
+/* HUF_encodeSymbol() :
+ * Appends the code of 'symbol' (value and length taken from CTable) to the bitstream,
+ * using FSE_addBitsFast(). No flush is performed here.
+ */
+static void HUF_encodeSymbol(FSE_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable)
+{
+    const HUF_CElt* const code = CTable + symbol;
+    FSE_addBitsFast(bitCPtr, code->val, code->nbBits);
+}
+
+#define FSE_FLUSHBITS_1(stream) \
+    if (sizeof((stream)->bitContainer)*8 < HUF_MAX_TABLELOG*2+7) FSE_FLUSHBITS(stream)
+
+#define FSE_FLUSHBITS_2(stream) \
+    if (sizeof((stream)->bitContainer)*8 < HUF_MAX_TABLELOG*4+7) FSE_FLUSHBITS(stream)
+
+/* HUF_compress_usingCTable() :
+ * Encodes 'src' with the Huffman codes of 'CTable' into 'dst'.
+ * Output layout : a 6-byte jump table (3 little-endian 16-bit sizes, for streams 1 to 3),
+ * followed by 4 independent bitstreams. The input is walked backwards by groups of 16 bytes ;
+ * within a group, stream 1 takes positions 0,4,8,12, stream 2 positions 1,5,9,13, and so on.
+ * The (srcSize & 15) trailing bytes are all written first, into stream 1.
+ * @return : total number of bytes written into 'dst',
+ *           or 0 when dstSize < 8, when a bitstream cannot be initialized,
+ *           or when a bitstream does not fit (FSE_closeCStream() returned 0).
+ * NOTE(review) : each stream size is stored through a (U16) cast ; presumably callers keep
+ *                blocks small enough for every stream to stay below 64 KB -- to be confirmed.
+ */
+size_t HUF_compress_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, HUF_CElt* CTable)
+{
+    const BYTE* ip = (const BYTE*) src;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* op = (BYTE*) ostart;
+    BYTE* const oend = ostart + dstSize;
+    U16* jumpTable = (U16*) dst;
+    size_t n, streamSize;
+    /* NOTE(review) : 'fast' is not referenced by name below ; presumably it is consumed by the
+     * FSE_FLUSHBITS() macro (defined outside this function) -- confirm before removing it */
+    const unsigned fast = (dstSize >= HUF_BLOCKBOUND(srcSize));
+    size_t errorCode;
+    FSE_CStream_t bitC;
+
+    /* init */
+	if (dstSize < 8) return 0;
+    op += 6;   /* jump Table -- could be optimized by delta / deviation */
+    errorCode = FSE_initCStream(&bitC, op, oend-op);
+    if (FSE_isError(errorCode)) return 0;
+
+    /* stream 1, part 1 : trailing (srcSize & 15) bytes, last one first.
+     * Every case intentionally falls through to the next one. */
+    n = srcSize & ~15;  // mod 16
+    switch (srcSize & 15)
+    {
+        case 15: HUF_encodeSymbol(&bitC, ip[n+14], CTable);
+                 FSE_FLUSHBITS_1(&bitC);
+        case 14: HUF_encodeSymbol(&bitC, ip[n+13], CTable);
+                 FSE_FLUSHBITS_2(&bitC);
+        case 13: HUF_encodeSymbol(&bitC, ip[n+12], CTable);
+                 FSE_FLUSHBITS_1(&bitC);
+        case 12: HUF_encodeSymbol(&bitC, ip[n+11], CTable);
+                 FSE_FLUSHBITS(&bitC);
+        case 11: HUF_encodeSymbol(&bitC, ip[n+10], CTable);
+                 FSE_FLUSHBITS_1(&bitC);
+        case 10: HUF_encodeSymbol(&bitC, ip[n+ 9], CTable);
+                 FSE_FLUSHBITS_2(&bitC);
+        case 9 : HUF_encodeSymbol(&bitC, ip[n+ 8], CTable);
+                 FSE_FLUSHBITS_1(&bitC);
+        case 8 : HUF_encodeSymbol(&bitC, ip[n+ 7], CTable);
+                 FSE_FLUSHBITS(&bitC);
+        case 7 : HUF_encodeSymbol(&bitC, ip[n+ 6], CTable);
+                 FSE_FLUSHBITS_1(&bitC);
+        case 6 : HUF_encodeSymbol(&bitC, ip[n+ 5], CTable);
+                 FSE_FLUSHBITS_2(&bitC);
+        case 5 : HUF_encodeSymbol(&bitC, ip[n+ 4], CTable);
+                 FSE_FLUSHBITS_1(&bitC);
+        case 4 : HUF_encodeSymbol(&bitC, ip[n+ 3], CTable);
+                 FSE_FLUSHBITS(&bitC);
+        case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable);
+                 FSE_FLUSHBITS_2(&bitC);
+        case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable);
+                 FSE_FLUSHBITS_1(&bitC);
+        case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable);
+                 FSE_FLUSHBITS(&bitC);
+        case 0 :
+        default: ;
+    }
+
+    /* stream 1, part 2 : positions 12, 8, 4, 0 of each 16-byte group, last group first */
+    for (; n>0; n-=16)
+    {
+        HUF_encodeSymbol(&bitC, ip[n- 4], CTable);
+        FSE_FLUSHBITS_1(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n- 8], CTable);
+        FSE_FLUSHBITS_2(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n-12], CTable);
+        FSE_FLUSHBITS_1(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n-16], CTable);
+        FSE_FLUSHBITS(&bitC);
+    }
+    streamSize = FSE_closeCStream(&bitC);
+    if (streamSize==0) return 0;   /* not enough space within dst buffer == uncompressible */
+    FSE_writeLE16(jumpTable, (U16)streamSize);
+    op += streamSize;
+
+    /* stream 2 : positions 13, 9, 5, 1 of each 16-byte group */
+    errorCode = FSE_initCStream(&bitC, op, oend-op);
+    if (FSE_isError(errorCode)) return 0;
+    n = srcSize & ~15;  // mod 16
+    for (; n>0; n-=16)
+    {
+        HUF_encodeSymbol(&bitC, ip[n- 3], CTable);
+        FSE_FLUSHBITS_1(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n- 7], CTable);
+        FSE_FLUSHBITS_2(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n-11], CTable);
+        FSE_FLUSHBITS_1(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n-15], CTable);
+        FSE_FLUSHBITS(&bitC);
+    }
+    streamSize = FSE_closeCStream(&bitC);
+    if (streamSize==0) return 0;   /* not enough space within dst buffer == uncompressible */
+    FSE_writeLE16(jumpTable+1, (U16)streamSize);
+    op += streamSize;
+
+    /* stream 3 : positions 14, 10, 6, 2 of each 16-byte group */
+    errorCode = FSE_initCStream(&bitC, op, oend-op);
+    if (FSE_isError(errorCode)) return 0;
+    n = srcSize & ~15;  // mod 16
+    for (; n>0; n-=16)
+    {
+        HUF_encodeSymbol(&bitC, ip[n- 2], CTable);
+        FSE_FLUSHBITS_1(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n- 6], CTable);
+        FSE_FLUSHBITS_2(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n-10], CTable);
+        FSE_FLUSHBITS_1(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n-14], CTable);
+        FSE_FLUSHBITS(&bitC);
+    }
+    streamSize = FSE_closeCStream(&bitC);
+    if (streamSize==0) return 0;   /* not enough space within dst buffer == uncompressible */
+    FSE_writeLE16(jumpTable+2, (U16)streamSize);
+    op += streamSize;
+
+    /* stream 4 : positions 15, 11, 7, 3 of each 16-byte group.
+     * Its size is not stored in the jump table. */
+    errorCode = FSE_initCStream(&bitC, op, oend-op);
+    if (FSE_isError(errorCode)) return 0;
+    n = srcSize & ~15;  // mod 16
+    for (; n>0; n-=16)
+    {
+        HUF_encodeSymbol(&bitC, ip[n- 1], CTable);
+        FSE_FLUSHBITS_1(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n- 5], CTable);
+        FSE_FLUSHBITS_2(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n- 9], CTable);
+        FSE_FLUSHBITS_1(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n-13], CTable);
+        FSE_FLUSHBITS(&bitC);
+    }
+    streamSize = FSE_closeCStream(&bitC);
+    if (streamSize==0) return 0;   /* not enough space within dst buffer == uncompressible */
+    op += streamSize;
+
+    return op-ostart;
+}
+
+
+/* HUF_compress2() :
+ * Compresses 'src' into 'dst' : counts symbols, builds a Huffman table, writes the table
+ * description, then the compressed streams.
+ * maxSymbolValue : 0 means HUF_MAX_SYMBOL_VALUE
+ * huffLog        : maximum code length ; 0 means HUF_DEFAULT_TABLELOG
+ * @return : size of compressed data written into 'dst',
+ *           srcSize when srcSize <= 1,
+ *           1 if 'src' is a single symbol repeated srcSize times (use RLE instead),
+ *           0 if 'src' is not compressible (content of 'dst' must then be ignored),
+ *           or an error code, which can be tested using FSE_isError().
+ */
+size_t HUF_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog)
+{
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + dstSize;
+
+    U32 count[HUF_MAX_SYMBOL_VALUE+1];
+    HUF_CElt CTable[HUF_MAX_SYMBOL_VALUE+1];
+    size_t errorCode;
+
+    /* early out */
+    if (srcSize <= 1) return srcSize;  /* Uncompressed or RLE */
+    if (!maxSymbolValue) maxSymbolValue = HUF_MAX_SYMBOL_VALUE;
+    if (!huffLog) huffLog = HUF_DEFAULT_TABLELOG;
+
+    /* Scan input and build symbol stats */
+    errorCode = FSE_count (count, &maxSymbolValue, (const BYTE*)src, srcSize);
+    if (FSE_isError(errorCode)) return errorCode;
+    if (errorCode == srcSize) return 1;   /* most frequent symbol fills the whole input : RLE */
+    if (errorCode < (srcSize >> 7)) return 0;   /* Heuristic : not compressible enough */
+
+    /* Build Huffman Tree */
+    errorCode = HUF_buildCTable (CTable, count, maxSymbolValue, huffLog);
+    if (FSE_isError(errorCode)) return errorCode;
+    huffLog = (U32)errorCode;   /* code length actually used */
+
+    /* Write table description header */
+    errorCode = HUF_writeCTable (op, dstSize, CTable, maxSymbolValue, huffLog);  /* don't write last symbol, implied */
+    if (FSE_isError(errorCode)) return errorCode;
+    op += errorCode;
+
+    /* Compress */
+    errorCode = HUF_compress_usingCTable(op, oend - op, src, srcSize, CTable);
+    if (FSE_isError(errorCode)) return errorCode;
+    if (errorCode==0) return 0;
+    op += errorCode;
+
+    /* check compressibility : no gain => report "not compressible" */
+    if ((size_t)(op-ostart) >= srcSize-1)
+        return 0;
+
+    return op-ostart;
+}
+
+/* HUF_compress() :
+ * Simple entry point : HUF_compress2() with maxSymbolValue = 255 and the default table log.
+ * srcSize is forwarded as a size_t, without narrowing.
+ * @return : same as HUF_compress2().
+ */
+size_t HUF_compress (void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    return HUF_compress2(dst, maxDstSize, src, srcSize, 255, HUF_DEFAULT_TABLELOG);
+}
+
+
+/*********************************************************
+*  Huff0 : Huffman block decompression
+*********************************************************/
+/* One cell of the decoding table : decoded symbol, and number of bits to consume */
+typedef struct {
+    BYTE byte;
+    BYTE nbBits;
+} HUF_DElt;
+
+/* HUF_readDTable() :
+ * Reads a Huffman table description from 'src' and builds the decoding table.
+ * DTable[0] must hold, on entry, the largest table log the table can accommodate ;
+ * it is replaced by the table log actually used. Decoding cells start at DTable+1.
+ * Header formats, selected by the first byte (iSize) :
+ *   iSize <  128        : weights compressed with FSE, over the next iSize bytes
+ *   128 <= iSize < 242  : (iSize-127) weights stored raw, 4 bits each
+ *   iSize >= 242        : all weights are 1 ; their number comes from a fixed table
+ * The weight of the last symbol is not stored : it is deduced, the total being a power of 2.
+ * @return : number of bytes read from 'src',
+ *           or an error code, which can be tested using FSE_isError().
+ */
+size_t HUF_readDTable (U16* DTable, const void* src, size_t srcSize)
+{
+    BYTE huffWeight[HUF_MAX_SYMBOL_VALUE + 1];
+    U32 rankVal[HUF_ABSOLUTEMAX_TABLELOG + 1];  /* large enough for values from 0 to 16 */
+    U32 weightTotal;
+    U32 maxBits;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;
+    size_t oSize;
+    U32 n;
+    U32 nextRankStart;
+    HUF_DElt* const dt = (HUF_DElt*)(DTable + 1);
+
+    FSE_STATIC_ASSERT(sizeof(HUF_DElt) == sizeof(U16));   /* if compilation fails here, assertion is false */
+    //memset(huffWeight, 0, sizeof(huffWeight));   /* should not be necessary, but some analyzer complain ... */
+    if (srcSize == 0) return (size_t)-FSE_ERROR_srcSize_wrong;   /* the header byte itself must be readable */
+    iSize = ip[0];
+    if (iSize >= 128)  /* special header */
+    {
+        if (iSize >= (242))   /* RLE */
+        {
+            static int l[14] = { 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128 };
+            oSize = l[iSize-242];
+            memset(huffWeight, 1, oSize);
+            iSize = 0;
+        }
+        else   /* Incompressible */
+        {
+            oSize = iSize - 127;
+            iSize = ((oSize+1)/2);
+            if (iSize+1 > srcSize) return (size_t)-FSE_ERROR_srcSize_wrong;
+            ip += 1;
+            for (n=0; n<oSize; n+=2)
+            {
+                /* two weights per byte : high nibble first */
+                huffWeight[n]   = ip[n/2] >> 4;
+                huffWeight[n+1] = ip[n/2] & 15;
+            }
+        }
+    }
+    else  /* header compressed with FSE (normal case) */
+    {
+        if (iSize+1 > srcSize) return (size_t)-FSE_ERROR_srcSize_wrong;
+        oSize = FSE_decompress(huffWeight, HUF_MAX_SYMBOL_VALUE, ip+1, iSize);   /* max 255 values decoded, last one is implied */
+        if (FSE_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    memset(rankVal, 0, sizeof(rankVal));
+    weightTotal = 0;
+    for (n=0; n<oSize; n++)
+    {
+        if (huffWeight[n] >= HUF_ABSOLUTEMAX_TABLELOG) return (size_t)-FSE_ERROR_corruptionDetected;
+        rankVal[huffWeight[n]]++;
+        weightTotal += (1 << huffWeight[n]) >> 1;   /* weight w counts for 2^(w-1) ; weight 0 counts for 0 */
+    }
+    /* no symbol with a non-zero weight : invalid, and FSE_highbit32(0) must not be evaluated */
+    if (weightTotal == 0) return (size_t)-FSE_ERROR_corruptionDetected;
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    maxBits = FSE_highbit32(weightTotal) + 1;
+    if (maxBits > DTable[0]) return (size_t)-FSE_ERROR_tableLog_tooLarge;   /* DTable is too small */
+    DTable[0] = (U16)maxBits;
+    {
+        U32 total = 1 << maxBits;
+        U32 rest = total - weightTotal;
+        U32 verif = 1 << FSE_highbit32(rest);
+        U32 lastWeight = FSE_highbit32(rest) + 1;
+        if (verif != rest) return (size_t)-FSE_ERROR_corruptionDetected;    /* last value must be a clean power of 2 */
+        huffWeight[oSize] = (BYTE)lastWeight;
+        rankVal[lastWeight]++;
+    }
+
+    /* check tree construction validity */
+    if ((rankVal[1] < 2) || (rankVal[1] & 1)) return (size_t)-FSE_ERROR_corruptionDetected;   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* Prepare ranks : rankVal[w] becomes the first table position for weight w */
+    nextRankStart = 0;
+    for (n=1; n<=maxBits; n++)
+    {
+        U32 current = nextRankStart;
+        nextRankStart += (rankVal[n] << (n-1));
+        rankVal[n] = current;
+    }
+
+    /* fill DTable : a symbol of weight w occupies 2^(w-1) consecutive cells */
+    for (n=0; n<=oSize; n++)
+    {
+        const U32 w = huffWeight[n];
+        const U32 length = (1 << w) >> 1;
+        U32 i;
+        HUF_DElt D;
+        D.byte = (BYTE)n; D.nbBits = (BYTE)(maxBits + 1 - w);
+        for (i = rankVal[w]; i < rankVal[w] + length; i++)
+            dt[i] = D;
+        rankVal[w] += length;
+    }
+
+    return iSize+1;
+}
+
+
+/* HUF_decodeSymbol() :
+ * Looks up the next dtLog bits of the stream in 'dt', consumes the number of bits
+ * recorded in the selected cell, and returns the symbol stored there.
+ */
+static BYTE HUF_decodeSymbol(FSE_DStream_t* Dstream, const HUF_DElt* dt, const U32 dtLog)
+{
+        const size_t index = FSE_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
+        const HUF_DElt* const cell = dt + index;
+        const BYTE symbol = cell->byte;
+        FSE_skipBits(Dstream, cell->nbBits);
+        return symbol;
+}
+
+/* HUF_decompress_usingDTable() :
+ * Decodes 4 interleaved Huffman bitstreams from 'cSrc' into 'dst', using 'DTable'
+ * (DTable[0] holds the table log, decoding cells start at DTable+1).
+ * Input layout : 3 little-endian 16-bit sizes (streams 1 to 3), then the 4 streams ;
+ * the size of stream 4 is whatever remains of cSrcSize.
+ * @return : number of bytes written into 'dst',
+ *           or an error code (srcSize_wrong, corruptionDetected, dstSize_tooSmall,
+ *           or whatever FSE_initDStream() reports), testable with FSE_isError().
+ * NOTE(review) : the 6-byte jump table is read in the initializers below, before the
+ *                size check ; presumably callers guarantee cSrcSize >= 6 -- to be confirmed.
+ */
+static size_t HUF_decompress_usingDTable(   /* -3% slower when non static */
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const U16* DTable)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-15;   /* the 16-symbols loop only runs while op < olimit */
+
+    const HUF_DElt* const dt = (const HUF_DElt*)(DTable+1);
+    const U32 dtLog = DTable[0];
+    size_t errorCode;
+    U32 reloadStatus;
+
+    /* Init */
+
+    const U16* jumpTable = (const U16*)cSrc;
+    const size_t length1 = FSE_readLE16(jumpTable);
+    const size_t length2 = FSE_readLE16(jumpTable+1);
+    const size_t length3 = FSE_readLE16(jumpTable+2);
+    const size_t length4 = cSrcSize - 6 - length1 - length2 - length3;   // check coherency !!
+    const char* const start1 = (const char*)(cSrc) + 6;
+    const char* const start2 = start1 + length1;
+    const char* const start3 = start2 + length2;
+    const char* const start4 = start3 + length3;
+    FSE_DStream_t bitD1, bitD2, bitD3, bitD4;
+
+    /* streams 1-3 plus the jump table must leave at least 1 byte for stream 4 */
+    if (length1+length2+length3+6 >= cSrcSize) return (size_t)-FSE_ERROR_srcSize_wrong;
+
+    errorCode = FSE_initDStream(&bitD1, start1, length1);
+    if (FSE_isError(errorCode)) return errorCode;
+    errorCode = FSE_initDStream(&bitD2, start2, length2);
+    if (FSE_isError(errorCode)) return errorCode;
+    errorCode = FSE_initDStream(&bitD3, start3, length3);
+    if (FSE_isError(errorCode)) return errorCode;
+    errorCode = FSE_initDStream(&bitD4, start4, length4);
+    if (FSE_isError(errorCode)) return errorCode;
+
+    reloadStatus=FSE_reloadDStream(&bitD2);
+
+    /* 16 symbols per loop */
+    /* loop status combines streams 2, 3 and 4 ; stream 1 is reloaded but its status is ignored here,
+     * since it also carries the tail symbols decoded after the loop */
+    for ( ; (reloadStatus<FSE_DStream_completed) && (op<olimit);  /* D2-3-4 are supposed to be synchronized and finish together */
+        op+=16, reloadStatus = FSE_reloadDStream(&bitD2) | FSE_reloadDStream(&bitD3) | FSE_reloadDStream(&bitD4), FSE_reloadDStream(&bitD1))
+    {
+/* _0 : decode only ; _1 : decode, then reload on 32-bits targets when HUF_MAX_TABLELOG > 12 ;
+ * _2 : decode, then reload on 32-bits targets */
+#define HUF_DECODE_SYMBOL_0(n, Dstream) \
+        op[n] = HUF_decodeSymbol(&Dstream, dt, dtLog);
+
+#define HUF_DECODE_SYMBOL_1(n, Dstream) \
+        op[n] = HUF_decodeSymbol(&Dstream, dt, dtLog); \
+        if (FSE_32bits() && (HUF_MAX_TABLELOG>12)) FSE_reloadDStream(&Dstream)
+
+#define HUF_DECODE_SYMBOL_2(n, Dstream) \
+        op[n] = HUF_decodeSymbol(&Dstream, dt, dtLog); \
+        if (FSE_32bits()) FSE_reloadDStream(&Dstream)
+
+        /* output position i is decoded from stream (i mod 4) + 1 */
+        HUF_DECODE_SYMBOL_1( 0, bitD1);
+        HUF_DECODE_SYMBOL_1( 1, bitD2);
+        HUF_DECODE_SYMBOL_1( 2, bitD3);
+        HUF_DECODE_SYMBOL_1( 3, bitD4);
+        HUF_DECODE_SYMBOL_2( 4, bitD1);
+        HUF_DECODE_SYMBOL_2( 5, bitD2);
+        HUF_DECODE_SYMBOL_2( 6, bitD3);
+        HUF_DECODE_SYMBOL_2( 7, bitD4);
+        HUF_DECODE_SYMBOL_1( 8, bitD1);
+        HUF_DECODE_SYMBOL_1( 9, bitD2);
+        HUF_DECODE_SYMBOL_1(10, bitD3);
+        HUF_DECODE_SYMBOL_1(11, bitD4);
+        HUF_DECODE_SYMBOL_0(12, bitD1);
+        HUF_DECODE_SYMBOL_0(13, bitD2);
+        HUF_DECODE_SYMBOL_0(14, bitD3);
+        HUF_DECODE_SYMBOL_0(15, bitD4);
+    }
+
+    if (reloadStatus!=FSE_DStream_completed)   /* not complete : some bitStream might be FSE_DStream_unfinished */
+        return (size_t)-FSE_ERROR_corruptionDetected;
+
+    /* tail */
+    /* remaining symbols are decoded one at a time from what is left of stream 1 */
+    {
+        // bitTail = bitD1;   // *much* slower : -20% !??!
+        FSE_DStream_t bitTail;
+        bitTail.ptr = bitD1.ptr;
+        bitTail.bitsConsumed = bitD1.bitsConsumed;
+        bitTail.bitContainer = bitD1.bitContainer;   // required in case of FSE_DStream_endOfBuffer
+        bitTail.start = start1;
+        for ( ; (FSE_reloadDStream(&bitTail) < FSE_DStream_completed) && (op<omax) ; op++)
+        {
+            HUF_DECODE_SYMBOL_0(0, bitTail);
+        }
+
+        if (FSE_endOfDStream(&bitTail))
+            return op-ostart;
+    }
+
+    if (op==omax) return (size_t)-FSE_ERROR_dstSize_tooSmall;   /* dst buffer is full, but cSrc unfinished */
+
+    return (size_t)-FSE_ERROR_corruptionDetected;
+}
+
+
+/* HUF_decompress() :
+ * Reads the table description at the start of 'cSrc' into a local decoding table,
+ * then decodes the rest of 'cSrc' into 'dst'.
+ * @return : result of HUF_decompress_usingDTable(),
+ *           or the error reported by HUF_readDTable(),
+ *           or srcSize_wrong when the table description takes the whole input.
+ */
+size_t HUF_decompress (void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLE(DTable, HUF_MAX_TABLELOG);
+    const BYTE* const istart = (const BYTE*) cSrc;
+    const size_t headerSize = HUF_readDTable (DTable, cSrc, cSrcSize);
+
+    if (FSE_isError(headerSize)) return headerSize;
+    if (headerSize >= cSrcSize) return (size_t)-FSE_ERROR_srcSize_wrong;
+
+    /* payload starts right after the table description */
+    return HUF_decompress_usingDTable (dst, maxDstSize, istart + headerSize, cSrcSize - headerSize, DTable);
+}
+
+
+#endif   /* FSE_COMMONDEFS_ONLY */
diff --git a/lib/fse.h b/lib/fse.h
index df95d258..36d86dcd 100644
--- a/lib/fse.h
+++ b/lib/fse.h
@@ -55,12 +55,11 @@ size_t FSE_decompress(void* dst,  size_t maxDstSize,
 /*
 FSE_compress():
     Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'.
-    'dst' buffer must be already allocated, and sized to handle worst case situations.
-    Worst case size evaluation is provided by FSE_compressBound().
-    return : size of compressed data
-    Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!!
-                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression.
-                     if FSE_isError(return), it's an error code.
+    'dst' buffer must be already allocated. Compression runs faster if maxDstSize >= FSE_compressBound(srcSize)
+    return : size of compressed data (<= maxDstSize)
+    Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead.
+                     if FSE_isError(return), compression failed (more details using FSE_getErrorName())
 
 FSE_decompress():
     Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
@@ -70,7 +69,33 @@ FSE_decompress():
 
     ** Important ** : FSE_decompress() doesn't decompress non-compressible nor RLE data !!!
     Why ? : making this distinction requires a header.
-    FSE library doesn't manage headers, which are intentionally left to the user layer.
+    Header management is intentionally delegated to the user layer, which can better manage special cases.
+*/
+
+
+/******************************************
+*  Huff0 simple functions
+******************************************/
+size_t HUF_compress(void* dst, size_t maxDstSize,
+              const void* src, size_t srcSize);
+size_t HUF_decompress(void* dst,  size_t maxDstSize,
+                const void* cSrc, size_t cSrcSize);
+/*
+HUF_compress():
+    Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'.
+    'dst' buffer must be already allocated. Compression runs faster if maxDstSize >= HUF_compressBound(srcSize)
+    return : size of compressed data (<= maxDstSize)
+    Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression.
+                     if FSE_isError(return), compression failed (more details using FSE_getErrorName())
+
+HUF_decompress():
+    Decompress Huff0 data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'maxDstSize'.
+    return : size of regenerated data (<= maxDstSize)
+             or an error code, which can be tested using FSE_isError()
+
+    ** Important ** : HUF_decompress() doesn't decompress non-compressible nor RLE data !!!
 */
 
 
@@ -98,6 +123,8 @@ FSE_compress2():
 */
 size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
 
+size_t HUF_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
+
 
 /******************************************
 *  FSE detailed API
@@ -106,18 +133,18 @@ size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize
 FSE_compress() does the following:
 1. count symbol occurrence from source[] into table count[]
 2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog)
-3. save normalized counters to memory buffer using writeHeader()
+3. save normalized counters to memory buffer using writeNCount()
 4. build encoding table 'CTable' from normalized counters
 5. encode the data stream using encoding table 'CTable'
 
 FSE_decompress() does the following:
-1. read normalized counters with readHeader()
+1. read normalized counters with readNCount()
 2. build decoding table 'DTable' from normalized counters
 3. decode the data stream using decoding table 'DTable'
 
-The following API allows to trigger specific sub-functions for advanced tasks.
+The following API allows targeting specific sub-functions for advanced tasks.
 For example, it's possible to compress several blocks using the same 'CTable',
-or to save and provide normalized distribution using one's own method.
+or to save and provide normalized distribution using an external method.
 */
 
 /* *** COMPRESSION *** */
@@ -163,8 +190,8 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalized
 
 /*
 Constructor and Destructor of type FSE_CTable
-Not that its size depends on parameters 'tableLog' and 'maxSymbolValue' */
-typedef unsigned FSE_CTable;   /* don't allocate that. It's just a way to be more restrictive than void */
+    Note that its size depends on 'tableLog' and 'maxSymbolValue' */
+typedef unsigned FSE_CTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
 FSE_CTable* FSE_createCTable (unsigned tableLog, unsigned maxSymbolValue);
 void        FSE_freeCTable (FSE_CTable* ct);
 
@@ -173,30 +200,32 @@ FSE_buildCTable():
    Builds 'ct', which must be already allocated, using FSE_createCTable()
    return : 0
             or an errorCode, which can be tested using FSE_isError() */
-size_t   FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
 
 /*
 FSE_compress_usingCTable():
    Compress 'src' using 'ct' into 'dst' which must be already allocated
-   return : size of compressed data
+   return : size of compressed data (<= maxDstSize)
+            or 0 if compressed data could not fit into 'dst'
             or an errorCode, which can be tested using FSE_isError() */
-size_t FSE_compress_usingCTable (void* dst, size_t dstSize, const void* src, size_t srcSize, const FSE_CTable* ct);
+size_t FSE_compress_usingCTable (void* dst, size_t maxDstSize, const void* src, size_t srcSize, const FSE_CTable* ct);
 
 /*
 Tutorial :
 ----------
-The first step is to count all symbols. FSE_count() provides one quick way to do this job.
+The first step is to count all symbols. FSE_count() does this job very fast.
 Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have 'maxSymbolValuePtr[0]+1' cells.
 'src' is a table of bytes of size 'srcSize'. All values within 'src' MUST be <= maxSymbolValuePtr[0]
 maxSymbolValuePtr[0] will be updated, with its real value (necessarily <= original value)
 FSE_count() will return the number of occurrence of the most frequent symbol.
+This can be used to know if there is a single symbol within 'src', and to quickly evaluate its compressibility.
 If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
 
 The next step is to normalize the frequencies.
 FSE_normalizeCount() will ensure that sum of frequencies is == 2 ^'tableLog'.
-It also guarantees a minimum of 1 to any Symbol which frequency is >= 1.
-You can use input 'tableLog'==0 to mean "use default tableLog value".
-If you are unsure of which tableLog value to use, you can optionally call FSE_optimalTableLog(),
+It also guarantees a minimum of 1 to any Symbol with frequency >= 1.
+You can use 'tableLog'==0 to mean "use default tableLog value".
+If you are unsure of which tableLog value to use, you can ask FSE_optimalTableLog(),
 which will provide the optimal valid tableLog given sourceSize, maxSymbolValue, and a user-defined maximum (0 means "default").
 
 The result of FSE_normalizeCount() will be saved into a table,
@@ -204,23 +233,23 @@ called 'normalizedCounter', which is a table of signed short.
 'normalizedCounter' must be already allocated, and have at least 'maxSymbolValue+1' cells.
 The return value is tableLog if everything proceeded as expected.
 It is 0 if there is a single symbol within distribution.
-If there is an error(typically, invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()).
+If there is an error (ex: invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()).
 
-'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeHeader().
-'header' buffer must be already allocated.
+'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeNCount().
+'buffer' must be already allocated.
 For guaranteed success, buffer size must be at least FSE_headerBound().
-The result of the function is the number of bytes written into 'header'.
-If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()) (for example, buffer size too small).
+The result of the function is the number of bytes written into 'buffer'.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError(); ex : buffer size too small).
 
 'normalizedCounter' can then be used to create the compression table 'CTable'.
-The space required by 'CTable' must be already allocated. Its size is provided by FSE_sizeof_CTable().
-'CTable' must be aligned of 4 bytes boundaries.
+The space required by 'CTable' must be already allocated, using FSE_createCTable().
 You can then use FSE_buildCTable() to fill 'CTable'.
-In both cases, if there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
+If there is an error, both functions will return an ErrorCode (which can be tested using FSE_isError()).
 
 'CTable' can then be used to compress 'src', with FSE_compress_usingCTable().
 Similar to FSE_count(), the convention is that 'src' is assumed to be a table of char of size 'srcSize'
-The function returns the size of compressed data (without header).
+The function returns the size of compressed data (without header), necessarily <= maxDstSize.
+If it returns '0', compressed data could not fit into 'dst'.
 If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
 */
 
@@ -237,26 +266,25 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSymbolValuePtr, un
 
 /*
 Constructor and Destructor of type FSE_DTable
-Note that its size depends on parameters 'tableLog' */
-typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void */
+    Note that its size depends on 'tableLog' */
+typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
 FSE_DTable* FSE_createDTable(unsigned tableLog);
 void        FSE_freeDTable(FSE_DTable* dt);
 
 /*
 FSE_buildDTable():
    Builds 'dt', which must be already allocated, using FSE_createDTable()
-   return : 1 if 'dt' is compatible with fast mode, 0 otherwise,
+   return : 0,
             or an errorCode, which can be tested using FSE_isError() */
 size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
 
 /*
 FSE_decompress_usingDTable():
-   Decompress compressed source 'cSrc' of size 'cSrcSize'
-   using 'dt' into 'dst' which must be already allocated.
-   Use fastMode==1 only if authorized by result of FSE_buildDTable().
+   Decompress compressed source 'cSrc' of size 'cSrcSize' using 'dt'
+   into 'dst' which must be already allocated.
    return : size of regenerated data (necessarily <= maxDstSize)
             or an errorCode, which can be tested using FSE_isError() */
-size_t FSE_decompress_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt, size_t fastMode);
+size_t FSE_decompress_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt);
 
 /*
 Tutorial :
@@ -266,26 +294,24 @@ Tutorial :
  If block is a single repeated byte, use memset() instead )
 
 The first step is to obtain the normalized frequencies of symbols.
-This can be performed by reading a header with FSE_readHeader().
-'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of short.
+This can be performed by FSE_readNCount() if it was saved using FSE_writeNCount().
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short.
 In practice, that means it's necessary to know 'maxSymbolValue' beforehand,
 or size the table to handle worst case situations (typically 256).
-FSE_readHeader will provide 'tableLog' and 'maxSymbolValue' stored into the header.
-The result of FSE_readHeader() is the number of bytes read from 'header'.
-Note that 'headerSize' must be at least 4 bytes, even if useful information is less than that.
+FSE_readNCount() will provide 'tableLog' and 'maxSymbolValue'.
+The result of FSE_readNCount() is the number of bytes read from 'rBuffer'.
+Note that 'rBufferSize' must be at least 4 bytes, even if useful information is less than that.
 If there is an error, the function will return an error code, which can be tested using FSE_isError().
 
-The next step is to create the decompression tables 'FSE_DTable' from 'normalizedCounter'.
+The next step is to build the decompression tables 'FSE_DTable' from 'normalizedCounter'.
 This is performed by the function FSE_buildDTable().
 The space required by 'FSE_DTable' must be already allocated using FSE_createDTable().
-The function will return 1 if FSE_DTable is compatible with fastMode, 0 otherwise.
 If there is an error, the function will return an error code, which can be tested using FSE_isError().
 
 'FSE_DTable' can then be used to decompress 'cSrc', with FSE_decompress_usingDTable().
-Only trigger fastMode if it was authorized by the result of FSE_buildDTable(), otherwise decompression will fail.
-cSrcSize must be correct, otherwise decompression will fail.
-FSE_decompress_usingDTable() result will tell how many bytes were regenerated.
-If there is an error, the function will return an error code, which can be tested using FSE_isError().
+'cSrcSize' must be strictly correct, otherwise decompression will fail.
+FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<=maxDstSize).
+If there is an error, the function will return an error code, which can be tested using FSE_isError(). (ex: dst buffer too small)
 */
 
 
diff --git a/lib/fse_static.h b/lib/fse_static.h
index f085d1f5..84e704c6 100644
--- a/lib/fse_static.h
+++ b/lib/fse_static.h
@@ -48,12 +48,25 @@ extern "C" {
 /******************************************
 *  Static allocation
 ******************************************/
-#define FSE_MAX_HEADERSIZE 512
-#define FSE_COMPRESSBOUND(size) (size + (size>>7) + FSE_MAX_HEADERSIZE)   /* Macro can be useful for static allocation */
-/* You can statically allocate a CTable as a table of unsigned using below macro */
+/* FSE buffer bounds */
+#define FSE_NCOUNTBOUND 512
+#define FSE_BLOCKBOUND(size) (size + (size>>7))
+#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* You can statically allocate FSE CTable/DTable as a table of unsigned using below macro */
 #define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue)   (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2))
 #define FSE_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<maxTableLog))
 
+/* Huff0 buffer bounds */
+#define HUF_CTABLEBOUND 129
+#define HUF_BLOCKBOUND(size) (size + (size>>8) + 8)   /* only true if pre-filtered with fast heuristic */
+#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* You can statically allocate Huff0 DTable as a table of unsigned short using below macro */
+#define HUF_DTABLE_SIZE_U16(maxTableLog)   (1 + (1<<maxTableLog))
+#define HUF_CREATE_STATIC_DTABLE(DTable, maxTableLog) \
+        unsigned short DTable[HUF_DTABLE_SIZE_U16(maxTableLog)] = { maxTableLog }
+
 
 /******************************************
 *  Error Management
@@ -96,6 +109,7 @@ size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
    You will want to enable link-time-optimization to ensure these functions are properly inlined in your binary.
    Visual seems to do it automatically.
    For gcc or clang, you'll need to add -flto flag at compilation and linking stages.
+   If none of these solutions is applicable, include "fse.c" directly.
 */
 
 typedef struct
@@ -104,6 +118,7 @@ typedef struct
     int    bitPos;
     char*  startPtr;
     char*  ptr;
+    char*  endPtr;
 } FSE_CStream_t;
 
 typedef struct
@@ -114,10 +129,10 @@ typedef struct
     unsigned    stateLog;
 } FSE_CState_t;
 
-void   FSE_initCStream(FSE_CStream_t* bitC, void* dstBuffer);
+size_t FSE_initCStream(FSE_CStream_t* bitC, void* dstBuffer, size_t maxDstSize);
 void   FSE_initCState(FSE_CState_t* CStatePtr, const FSE_CTable* ct);
 
-void   FSE_encodeSymbol(FSE_CStream_t* bitC, FSE_CState_t* CStatePtr, unsigned char symbol);
+void   FSE_encodeSymbol(FSE_CStream_t* bitC, FSE_CState_t* CStatePtr, unsigned symbol);
 void   FSE_addBits(FSE_CStream_t* bitC, size_t value, unsigned nbBits);
 void   FSE_flushBits(FSE_CStream_t* bitC);
 
@@ -133,17 +148,18 @@ So the first symbol you will encode is the last you will decode, like a LIFO sta
 
 You will need a few variables to track your CStream. They are :
 
-FSE_CTable ct;        // Provided by FSE_buildCTable()
-FSE_CStream_t bitC;   // bitStream tracking structure
-FSE_CState_t state;   // State tracking structure (can have several)
+FSE_CTable    ct;         // Provided by FSE_buildCTable()
+FSE_CStream_t bitStream;  // bitStream tracking structure
+FSE_CState_t  state;      // State tracking structure (can have several)
 
 
 The first thing to do is to init bitStream and state.
-    FSE_initCStream(&bitC, dstBuffer);
+    size_t errorCode = FSE_initCStream(&bitStream, dstBuffer, maxDstSize);
     FSE_initCState(&state, ct);
 
+Note that FSE_initCStream() can produce an error code, so its result should be tested using FSE_isError().
 You can then encode your input data, byte after byte.
-FSE_encodeByte() outputs a maximum of 'tableLog' bits at a time.
+FSE_encodeSymbol() outputs a maximum of 'tableLog' bits at a time.
 Remember decoding will be done in reverse direction.
     FSE_encodeByte(&bitStream, &state, symbol);
 
@@ -159,8 +175,9 @@ Writing data to memory is a manual operation, performed by the flushBits functio
 Your last FSE encoding operation shall be to flush your last state value(s).
     FSE_flushState(&bitStream, &state);
 
-Finally, you must then close the bitStream.
-The function returns the size in bytes of CStream.
+Finally, you must close the bitStream.
+The function returns the size of CStream in bytes.
+If data couldn't fit into dstBuffer, it will return 0 (== not compressible).
 If there is an error, it returns an errorCode (which can be tested using FSE_isError()).
     size_t size = FSE_closeCStream(&bitStream);
 */
@@ -194,6 +211,12 @@ unsigned int  FSE_reloadDStream(FSE_DStream_t* bitD);
 unsigned FSE_endOfDStream(const FSE_DStream_t* bitD);
 unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr);
 
+typedef enum { FSE_DStream_unfinished = 0,
+               FSE_DStream_endOfBuffer = 1,
+               FSE_DStream_completed = 2,
+               FSE_DStream_tooFar = 3 } FSE_DStream_status;  /* result of FSE_reloadDStream() */
+               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... ?! */
+
 /*
 Let's now decompose FSE_decompress_usingDTable() into its unitary components.
 You will decode FSE-encoded symbols from the bitStream,
@@ -201,16 +224,16 @@ and also any other bitFields you put in, **in reverse order**.
 
 You will need a few variables to track your bitStream. They are :
 
-FSE_DStream_t DStream;  // Stream context
-FSE_DState_t DState;    // State context. Multiple ones are possible
-FSE_DTable dt;          // Decoding table, provided by FSE_buildDTable()
-U32 tableLog;           // Provided by FSE_readHeader()
+FSE_DStream_t DStream;    // Stream context
+FSE_DState_t  DState;     // State context. Multiple ones are possible
+FSE_DTable*   DTablePtr;  // Decoding table, provided by FSE_buildDTable()
 
 The first thing to do is to init the bitStream.
-    errorCode = FSE_initDStream(&DStream, &optionalId, srcBuffer, srcSize);
+    errorCode = FSE_initDStream(&DStream, srcBuffer, srcSize);
 
-You should then retrieve your initial state(s) :
-    errorCode = FSE_initDState(&DState, &DStream, dt, tableLog);
+You should then retrieve your initial state(s)
+(in reverse flushing order if you have several ones) :
+    errorCode = FSE_initDState(&DState, &DStream, DTablePtr);
 
 You can then decode your data, symbol after symbol.
 For information the maximum number of bits read by FSE_decodeSymbol() is 'tableLog'.
@@ -218,28 +241,28 @@ Keep in mind that symbols are decoded in reverse order, like a LIFO stack (last
     unsigned char symbol = FSE_decodeSymbol(&DState, &DStream);
 
 You can retrieve any bitfield you eventually stored into the bitStream (in reverse order)
-Note : maximum allowed nbBits is 25
-    unsigned int bitField = FSE_readBits(&DStream, nbBits);
+Note : maximum allowed nbBits is 25, for 32-bits compatibility
+    size_t bitField = FSE_readBits(&DStream, nbBits);
 
-All above operations only read from local register (which size is controlled by bitD_t==32 bits).
+All above operations only read from local register (whose size depends on size_t).
 Refueling the register from memory is manually performed by the reload method.
     endSignal = FSE_reloadDStream(&DStream);
 
 FSE_reloadDStream() result tells if there is still some more data to read from DStream.
-0 : there is still some data left into the DStream.
-1 : Dstream reached end of buffer, but is not yet fully extracted. It will not load data from memory any more.
-2 : Dstream reached its exact end, corresponding in general to decompression completed.
-3 : Dstream went too far. Decompression result is corrupted.
+FSE_DStream_unfinished : there is still some data left into the DStream.
+FSE_DStream_endOfBuffer : Dstream reached end of buffer. Its container may no longer be completely filled.
+FSE_DStream_completed : Dstream reached its exact end, corresponding in general to decompression completed.
+FSE_DStream_tooFar : Dstream went too far. Decompression result is corrupted.
 
-When reaching end of buffer(1), progress slowly, notably if you decode multiple symbols per loop,
+When reaching end of buffer (FSE_DStream_endOfBuffer), progress slowly, notably if you decode multiple symbols per loop,
 to properly detect the exact end of stream.
 After each decoded symbol, check if DStream is fully consumed using this simple test :
-    FSE_reloadDStream(&DStream) >= 2
+    FSE_reloadDStream(&DStream) >= FSE_DStream_completed
 
 When it's done, verify decompression is fully completed, by checking both DStream and the relevant states.
 Checking if DStream has reached its end is performed by :
     FSE_endOfDStream(&DStream);
-Check also the states. There might be some entropy left there, able to decode some high probability (>50%) symbol.
+Check also the states. There might be some symbols left there, if some high probability ones (>50%) are possible.
     FSE_endOfDState(&DState);
 */
 
@@ -251,7 +274,7 @@ size_t FSE_readBitsFast(FSE_DStream_t* bitD, unsigned nbBits);
 /* faster, but works only if nbBits >= 1 (otherwise, result will be corrupted) */
 
 unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, FSE_DStream_t* bitD);
-/* faster, but works only if nbBits >= 1 (otherwise, result will be corrupted) */
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
 
 
 #if defined (__cplusplus)
diff --git a/lib/zstd.c b/lib/zstd.c
index 5e4b2fa5..bcc16c87 100644
--- a/lib/zstd.c
+++ b/lib/zstd.c
@@ -124,7 +124,7 @@ typedef unsigned long long  U64;
 /********************************************************
 *  Constants
 *********************************************************/
-static const U32 ZSTD_magicNumber = 0xFD2FB51C;   /* Initial (limited) frame format */
+static const U32 ZSTD_magicNumber = 0xFD2FB51E;   /* 3rd version : seqNb header */
 
 #define HASH_LOG (ZSTD_MEMORY_USAGE - 2)
 #define HASH_TABLESIZE (1 << HASH_LOG)
@@ -158,6 +158,8 @@ static const U32 g_searchStrength = 8;
 #define MLFSELog   10
 #define LLFSELog   10
 #define OffFSELog   9
+#define MAX(a,b) ((a)<(b)?(b):(a))
+#define MaxSeq MAX(MaxLL, MaxML)
 
 #define LITERAL_NOENTROPY 63
 #define COMMAND_NOENTROPY 7   /* to remove */
@@ -200,6 +202,27 @@ static void ZSTD_wildcopy(void* dst, const void* src, size_t length)
     while (op < oend) COPY8(op, ip);
 }
 
+static U16 ZSTD_readLE16(const void* memPtr)   /* returns the 16-bit value stored at memPtr in little-endian byte order */
+{
+    if (ZSTD_isLittleEndian()) return ZSTD_read16(memPtr);   /* little-endian host : presumably a direct native read (ZSTD_read16 is not visible here) */
+    else
+    {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U16)((U16)p[0] + ((U16)p[1]<<8));   /* p[0] is the low byte, p[1] the high byte */
+    }
+}
+
+static void ZSTD_writeLE16(void* memPtr, U16 val)   /* stores val at memPtr as 2 bytes, least significant byte first */
+{
+    if (ZSTD_isLittleEndian()) memcpy(memPtr, &val, sizeof(val));   /* host byte order already matches : copy as is */
+    else
+    {
+        BYTE* p = (BYTE*)memPtr;
+        p[0] = (BYTE)val;   /* low byte first */
+        p[1] = (BYTE)(val>>8);   /* then high byte */
+    }
+}
+
 static U32 ZSTD_readLE32(const void* memPtr)
 {
     if (ZSTD_isLittleEndian())
@@ -242,40 +265,6 @@ static void ZSTD_writeBE32(void* memPtr, U32 value)
     p[3] = (BYTE)(value>>0);
 }
 
-static size_t ZSTD_writeProgressive(void* ptr, size_t value)
-{
-    BYTE* const bStart = (BYTE* const)ptr;
-    BYTE* byte = bStart;
-
-    do
-    {
-        BYTE l = value & 127;
-        value >>= 7;
-        if (value) l += 128;
-        *byte++ = l;
-    } while (value);
-
-    return byte - bStart;
-}
-
-
-static size_t ZSTD_readProgressive(size_t* result, const void* ptr)
-{
-    const BYTE* const bStart = (const BYTE* const)ptr;
-    const BYTE* byte = bStart;
-    size_t r = 0;
-    U32 shift = 0;
-
-    do
-    {
-        r += (*byte & 127) << shift;
-        shift += 7;
-    } while (*byte++ & 128);
-
-    *result = r;
-    return byte - bStart;
-}
-
 
 /**************************************
 *  Local structures
@@ -333,6 +322,11 @@ ZSTD_Cctx* ZSTD_createCCtx(void)
     ZSTD_Cctx* ctx = (ZSTD_Cctx*) malloc( sizeof(ZSTD_Cctx) );
     if (ctx==NULL) return NULL;
     ctx->seqStore.buffer = malloc(WORKPLACESIZE);
+    if (ctx->seqStore.buffer==NULL)
+    {
+        free(ctx);
+        return NULL;
+    }
     ctx->seqStore.offsetStart = (U32*) (ctx->seqStore.buffer);
     ctx->seqStore.offCodeStart = (BYTE*) (ctx->seqStore.offsetStart + (BLOCKSIZE>>2));
     ctx->seqStore.litStart = ctx->seqStore.offCodeStart + (BLOCKSIZE>>2);
@@ -512,7 +506,7 @@ static size_t ZSTD_compressRle (void* dst, size_t maxDstSize, const void* src, s
     /* Build header */
     ostart[0]  = (BYTE)(srcSize>>16);
     ostart[1]  = (BYTE)(srcSize>>8);
-    ostart[2]  = (BYTE)srcSize;
+    ostart[2]  = (BYTE) srcSize;
     ostart[0] += (BYTE)(bt_rle<<6);
 
     return ZSTD_blockHeaderSize+1;
@@ -527,73 +521,15 @@ static size_t ZSTD_noCompressBlock (void* dst, size_t maxDstSize, const void* sr
     memcpy(ostart + ZSTD_blockHeaderSize, src, srcSize);
 
     /* Build header */
-    ostart[0] = (BYTE)(srcSize>>16);
-    ostart[1] = (BYTE)(srcSize>>8);
-    ostart[2] = (BYTE)srcSize;
+    ostart[0]  = (BYTE)(srcSize>>16);
+    ostart[1]  = (BYTE)(srcSize>>8);
+    ostart[2]  = (BYTE) srcSize;
     ostart[0] += (BYTE)(bt_raw<<6);   /* is a raw (uncompressed) block */
 
     return ZSTD_blockHeaderSize+srcSize;
 }
 
 
-/* return : size of CStream in bits */
-static size_t ZSTD_compressLiterals_usingCTable(void* dst, size_t dstSize,
-                                          const void* src, size_t srcSize,
-                                          const FSE_CTable* CTable)
-{
-    const BYTE* const istart = (const BYTE*)src;
-    const BYTE* ip = istart;
-    const BYTE* const iend = istart + srcSize;
-    FSE_CStream_t bitC;
-    FSE_CState_t CState1, CState2;
-
-    /* init */
-    (void)dstSize;   // objective : ensure it fits into dstBuffer (Todo)
-    FSE_initCStream(&bitC, dst);
-    FSE_initCState(&CState1, CTable);
-    CState2 = CState1;
-
-    /* Note : at this stage, srcSize > LITERALS_NOENTROPY (checked by ZSTD_compressLiterals()) */
-    // join to mod 2
-    if (srcSize & 1)
-    {
-        FSE_encodeSymbol(&bitC, &CState1, *ip++);
-        FSE_flushBits(&bitC);
-    }
-
-    // join to mod 4
-    if ((sizeof(size_t)*8 > LitFSELog*4+7 ) && (srcSize & 2))   // test bit 2
-    {
-        FSE_encodeSymbol(&bitC, &CState2, *ip++);
-        FSE_encodeSymbol(&bitC, &CState1, *ip++);
-        FSE_flushBits(&bitC);
-    }
-
-    // 2 or 4 encoding per loop
-    while (ip<iend)
-    {
-        FSE_encodeSymbol(&bitC, &CState2, *ip++);
-
-        if (sizeof(size_t)*8 < LitFSELog*2+7 )   // this test must be static
-            FSE_flushBits(&bitC);
-
-        FSE_encodeSymbol(&bitC, &CState1, *ip++);
-
-        if (sizeof(size_t)*8 > LitFSELog*4+7 )   // this test must be static
-        {
-            FSE_encodeSymbol(&bitC, &CState2, *ip++);
-            FSE_encodeSymbol(&bitC, &CState1, *ip++);
-        }
-
-        FSE_flushBits(&bitC);
-    }
-
-    FSE_flushCState(&bitC, &CState2);
-    FSE_flushCState(&bitC, &CState1);
-    return FSE_closeCStream(&bitC);
-}
-
-
 size_t ZSTD_minGain(size_t srcSize)
 {
     return (srcSize >> 6) + 1;
@@ -603,101 +539,70 @@ size_t ZSTD_minGain(size_t srcSize)
 static size_t ZSTD_compressLiterals (void* dst, size_t dstSize,
                                      const void* src, size_t srcSize)
 {
-    const BYTE* const istart = (const BYTE*) src;
-    const BYTE* ip = istart;
-
-    BYTE* const ostart = (BYTE*) dst;
-    BYTE* op = ostart + ZSTD_blockHeaderSize;
-    BYTE* const oend = ostart + dstSize;
-
-    U32 maxSymbolValue = 256;
-    U32 tableLog = LitFSELog;
-    U32 count[256];
-    S16 norm[256];
-    U32 CTable[ FSE_CTABLE_SIZE_U32(LitFSELog, 256) ];
-    size_t errorCode;
     const size_t minGain = ZSTD_minGain(srcSize);
+    BYTE* const ostart = (BYTE*)dst;
+    size_t hsize;
+	static const size_t LHSIZE = 5;
 
-    /* early out */
-    if (dstSize < FSE_compressBound(srcSize)) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;
+	if (dstSize < LHSIZE+1) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;   /* not enough space for compression */
 
-    /* Scan input and build symbol stats */
-    errorCode = FSE_count (count, &maxSymbolValue, ip, srcSize);
-    if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC;
-    if (errorCode == srcSize) return 1;
-    if (errorCode < (srcSize >> 6)) return 0;   /* cheap heuristic : probably not compressible enough */
+	hsize = HUF_compress(ostart+LHSIZE, dstSize-LHSIZE, src, srcSize);
+    if (hsize<2) return hsize;   /* special cases */
+    if (hsize >= srcSize - minGain) return 0;
 
-    tableLog = FSE_optimalTableLog(tableLog, srcSize, maxSymbolValue);
-    errorCode = (int)FSE_normalizeCount (norm, tableLog, count, srcSize, maxSymbolValue);
-    if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC;
-
-    /* Write table description header */
-    errorCode = FSE_writeNCount (op, FSE_MAX_HEADERSIZE, norm, maxSymbolValue, tableLog);
-    if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC;
-    op += errorCode;
-
-    /* Compress */
-    errorCode = FSE_buildCTable (CTable, norm, maxSymbolValue, tableLog);
-    if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC;
-    errorCode = ZSTD_compressLiterals_usingCTable(op, oend - op, ip, srcSize, CTable);
-    if (ZSTD_isError(errorCode)) return errorCode;
-    op += errorCode;
-
-    /* check compressibility */
-    if ( (size_t)(op-ostart) >= srcSize-minGain)
-        return 0;
+    hsize += 2;  /* work around vs fixed 3-bytes header */
 
     /* Build header */
     {
-        size_t totalSize;
-        totalSize  = op - ostart - ZSTD_blockHeaderSize;
-        ostart[0]  = (BYTE)(totalSize>>16);
-        ostart[1]  = (BYTE)(totalSize>>8);
-        ostart[2]  = (BYTE)totalSize;
-        ostart[0] += (BYTE)(bt_compressed<<6); /* is a block, is compressed */
+        ostart[0]  = (BYTE)(bt_compressed<<6); /* is a block, is compressed */
+        ostart[0] += (BYTE)(hsize>>16);
+        ostart[1]  = (BYTE)(hsize>>8);
+        ostart[2]  = (BYTE)(hsize>>0);
+        ostart[0] += (BYTE)((srcSize>>16)<<3);
+        ostart[3]  = (BYTE)(srcSize>>8);
+        ostart[4]  = (BYTE)(srcSize>>0);
     }
 
-    return op-ostart;
+    hsize -= 2;
+    return hsize+LHSIZE;
 }
 
 
 static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize,
                                      const seqStore_t* seqStorePtr,
-                                     size_t lastLLSize, size_t srcSize)
+                                     size_t srcSize)
 {
-    FSE_CStream_t blockStream;
-    U32 count[256];
-    S16 norm[256];
+    U32 count[MaxSeq+1];
+    S16 norm[MaxSeq+1];
     size_t mostFrequent;
     U32 max = 255;
     U32 tableLog = 11;
     U32 CTable_LitLength  [FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL )];
     U32 CTable_OffsetBits [FSE_CTABLE_SIZE_U32(OffFSELog,MaxOff)];
     U32 CTable_MatchLength[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML )];
-    U32 LLtype, Offtype, MLtype;
+    U32 LLtype, Offtype, MLtype;   /* compressed, raw or rle */
     const BYTE* const op_lit_start = seqStorePtr->litStart;
     const BYTE* op_lit = seqStorePtr->lit;
-    const BYTE* const op_litLength_start = seqStorePtr->litLengthStart;
+    const BYTE* const llTable = seqStorePtr->litLengthStart;
     const BYTE* op_litLength = seqStorePtr->litLength;
-    const U32*  op_offset = seqStorePtr->offset;
-    const BYTE* op_matchLength = seqStorePtr->matchLength;
-    const size_t nbSeq = op_litLength - op_litLength_start;
-    BYTE* op;
-    BYTE* offsetBits_start = seqStorePtr->offCodeStart;
-    BYTE* offsetBitsPtr = offsetBits_start;
+    const BYTE* const mlTable = seqStorePtr->matchLengthStart;
+    const U32*  const offsetTable = seqStorePtr->offsetStart;
+    BYTE* const offCodeTable = seqStorePtr->offCodeStart;
+    BYTE* op = dst;
+    BYTE* const oend = dst + maxDstSize;
+    const size_t nbSeq = op_litLength - llTable;
     const size_t minGain = ZSTD_minGain(srcSize);
     const size_t maxCSize = srcSize - minGain;
     const size_t minSeqSize = 1 /*lastL*/ + 2 /*dHead*/ + 2 /*dumpsIn*/ + 5 /*SeqHead*/ + 3 /*SeqIn*/ + 1 /*margin*/ + ZSTD_blockHeaderSize;
     const size_t maxLSize = maxCSize > minSeqSize ? maxCSize - minSeqSize : 0;
     BYTE* seqHead;
 
-    /* init */
-    op = dst;
 
-    /* Encode literals */
+    /* Compress literals */
     {
         size_t cSize;
         size_t litSize = op_lit - op_lit_start;
+
         if (litSize <= LITERAL_NOENTROPY) cSize = ZSTD_noCompressBlock (op, maxDstSize, op_lit_start, litSize);
         else
         {
@@ -713,13 +618,13 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize,
         op += cSize;
     }
 
-    /* Encode Sequences */
-
-    /* seqHeader */
-    op += ZSTD_writeProgressive(op, lastLLSize);
+    /* Sequences Header */
+	if ((oend-op) < 2+3+6)  /* nbSeq + dumpsLength + 3*rleCTable*/
+		return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;
+    ZSTD_writeLE16(op, (U16)nbSeq); op+=2;
     seqHead = op;
 
-    /* dumps */
+    /* dumps : contains too large lengths */
     {
         size_t dumpsLength = seqStorePtr->dumps - seqStorePtr->dumpsStart;
         if (dumpsLength < 512)
@@ -735,11 +640,12 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize,
             op[2] = (BYTE)(dumpsLength);
             op += 3;
         }
+		if ((size_t)(oend-op) < dumpsLength+6) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;
         memcpy(op, seqStorePtr->dumpsStart, dumpsLength);
         op += dumpsLength;
     }
 
-    /* Encoding table of Literal Lengths */
+    /* CTable for Literal Lengths */
     max = MaxLL;
     mostFrequent = FSE_countFast(count, &max, seqStorePtr->litLengthStart, nbSeq);
     if ((mostFrequent == nbSeq) && (nbSeq > 2))
@@ -755,30 +661,31 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize,
     }
     else
     {
+		size_t NCountSize;
         tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max);
         FSE_normalizeCount(norm, tableLog, count, nbSeq, max);
-        op += FSE_writeNCount(op, maxDstSize, norm, max, tableLog);
+		NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog);   /* overflow protected */
+		if (FSE_isError(NCountSize)) return (size_t)-ZSTD_ERROR_GENERIC;
+        op += NCountSize;
         FSE_buildCTable(CTable_LitLength, norm, max, tableLog);
         LLtype = bt_compressed;
     }
 
-    /* Encoding table of Offsets */
+    /* CTable for Offsets codes */
     {
-        /* create OffsetBits */
+        /* create Offset codes */
         size_t i;
-        const U32* const op_offset_start = seqStorePtr->offsetStart;
         max = MaxOff;
         for (i=0; i<nbSeq; i++)
         {
-            offsetBits_start[i] = (BYTE)ZSTD_highbit(op_offset_start[i]) + 1;
-            if (op_offset_start[i]==0) offsetBits_start[i]=0;
+            offCodeTable[i] = (BYTE)ZSTD_highbit(offsetTable[i]) + 1;
+            if (offsetTable[i]==0) offCodeTable[i]=0;
         }
-        offsetBitsPtr += nbSeq;
-        mostFrequent = FSE_countFast(count, &max, offsetBits_start, nbSeq);
+        mostFrequent = FSE_countFast(count, &max, offCodeTable, nbSeq);
     }
     if ((mostFrequent == nbSeq) && (nbSeq > 2))
     {
-        *op++ = *offsetBits_start;
+        *op++ = *offCodeTable;
         FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max);
         Offtype = bt_rle;
     }
@@ -789,14 +696,17 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize,
     }
     else
     {
+		size_t NCountSize;
         tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max);
         FSE_normalizeCount(norm, tableLog, count, nbSeq, max);
-        op += FSE_writeNCount(op, maxDstSize, norm, max, tableLog);
+		NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog);   /* overflow protected */
+		if (FSE_isError(NCountSize)) return (size_t)-ZSTD_ERROR_GENERIC;
+        op += NCountSize;
         FSE_buildCTable(CTable_OffsetBits, norm, max, tableLog);
         Offtype = bt_compressed;
     }
 
-    /* Encoding Table of MatchLengths */
+    /* CTable for MatchLengths */
     max = MaxML;
     mostFrequent = FSE_countFast(count, &max, seqStorePtr->matchLengthStart, nbSeq);
     if ((mostFrequent == nbSeq) && (nbSeq > 2))
@@ -812,33 +722,40 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize,
     }
     else
     {
+		size_t NCountSize;
         tableLog = FSE_optimalTableLog(MLFSELog, nbSeq, max);
         FSE_normalizeCount(norm, tableLog, count, nbSeq, max);
-        op += FSE_writeNCount(op, maxDstSize, norm, max, tableLog);
+		NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog);   /* overflow protected */
+		if (FSE_isError(NCountSize)) return (size_t)-ZSTD_ERROR_GENERIC;
+        op += NCountSize;
         FSE_buildCTable(CTable_MatchLength, norm, max, tableLog);
         MLtype = bt_compressed;
     }
 
     seqHead[0] += (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2));
 
-    /* Encoding */
+    /* Encoding Sequences */
     {
+        size_t streamSize, errorCode;
+        FSE_CStream_t blockStream;
         FSE_CState_t stateMatchLength;
         FSE_CState_t stateOffsetBits;
         FSE_CState_t stateLitLength;
+        int i;
 
-        FSE_initCStream(&blockStream, op);
+        errorCode = FSE_initCStream(&blockStream, op, oend-op);
+        if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;   /* not enough space remaining */
         FSE_initCState(&stateMatchLength, CTable_MatchLength);
         FSE_initCState(&stateOffsetBits, CTable_OffsetBits);
         FSE_initCState(&stateLitLength, CTable_LitLength);
 
-        while (op_litLength > op_litLength_start)
+        for (i=(int)nbSeq-1; i>=0; i--)
         {
-            BYTE matchLength = *(--op_matchLength);
-            U32  offset = *(--op_offset);
-            BYTE offCode = *(--offsetBitsPtr);                              /* 32b*/  /* 64b*/
+            BYTE matchLength = mlTable[i];
+            U32  offset = offsetTable[i];
+            BYTE offCode = offCodeTable[i];                                 /* 32b*/  /* 64b*/
             U32 nbBits = (offCode-1) * (!!offCode);
-            BYTE litLength = *(--op_litLength);                             /* (7)*/  /* (7)*/
+            BYTE litLength = llTable[i];                                    /* (7)*/  /* (7)*/
             FSE_encodeSymbol(&blockStream, &stateMatchLength, matchLength); /* 17 */  /* 17 */
             if (ZSTD_32bits()) FSE_flushBits(&blockStream);                 /*  7 */
             FSE_addBits(&blockStream, offset, nbBits);                      /* 32 */  /* 42 */
@@ -851,9 +768,11 @@ static size_t ZSTD_compressSequences(BYTE* dst, size_t maxDstSize,
         FSE_flushCState(&blockStream, &stateMatchLength);
         FSE_flushCState(&blockStream, &stateOffsetBits);
         FSE_flushCState(&blockStream, &stateLitLength);
-    }
 
-    op += FSE_closeCStream(&blockStream);
+        streamSize = FSE_closeCStream(&blockStream);
+        if (streamSize==0) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;   /* not enough space */
+        op += streamSize;
+    }
 
     /* check compressibility */
     if ((size_t)(op-dst) >= maxCSize) return 0;
@@ -952,7 +871,6 @@ static size_t ZSTD_compressBlock(void* cctx, void* dst, size_t maxDstSize, const
     const BYTE* const ilimit = iend - 16;
 
     size_t prevOffset=0, offset=0;
-    size_t lastLLSize;
 
 
     /* init */
@@ -988,17 +906,19 @@ static size_t ZSTD_compressBlock(void* cctx, void* dst, size_t maxDstSize, const
     }
 
     /* Last Literals */
-    lastLLSize = iend - anchor;
-    memcpy(seqStorePtr->lit, anchor, lastLLSize);
-    seqStorePtr->lit += lastLLSize;
+    {
+        size_t lastLLSize = iend - anchor;
+        memcpy(seqStorePtr->lit, anchor, lastLLSize);
+        seqStorePtr->lit += lastLLSize;
+    }
 
     /* Finale compression stage */
     return ZSTD_compressSequences((BYTE*)dst, maxDstSize,
-                                  seqStorePtr, lastLLSize, srcSize);
+                                  seqStorePtr, srcSize);
 }
 
 
-size_t ZSTD_compressBegin(ZSTD_Cctx*  ctx, void* dst, size_t maxDstSize)
+size_t ZSTD_compressBegin(ZSTD_Cctx* ctx, void* dst, size_t maxDstSize)
 {
     /* Sanity check */
     if (maxDstSize < ZSTD_frameHeaderSize) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;
@@ -1092,14 +1012,13 @@ size_t ZSTD_compressContinue(ZSTD_Cctx*  cctx, void* dst, size_t maxDstSize, con
     const U32 updateRate = 2 * BLOCKSIZE;
 
     /*  Init */
-    if (maxDstSize < ZSTD_compressBound(srcSize) - 4 /* frame header size*/) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;
     if (ctx->base==NULL)
         ctx->base = (const BYTE*)src, ctx->current=0, ctx->nextUpdate = g_maxDistance;
     if (src != ctx->base + ctx->current)   /* not contiguous */
     {
-            ZSTD_resetCCtx(ctx);
-            ctx->base = (const BYTE*)src;
-            ctx->current = 0;
+        ZSTD_resetCCtx(ctx);
+        ctx->base = (const BYTE*)src;
+        ctx->current = 0;
     }
     ctx->current += (U32)srcSize;
 
@@ -1109,8 +1028,11 @@ size_t ZSTD_compressContinue(ZSTD_Cctx*  cctx, void* dst, size_t maxDstSize, con
         size_t blockSize = BLOCKSIZE;
         if (blockSize > srcSize) blockSize = srcSize;
 
+		if (maxDstSize < 2*ZSTD_blockHeaderSize+1)  /* one RLE block + endMark */
+            return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;
+
         /* update hash table */
-        if (g_maxDistance <= BLOCKSIZE)   /* static test => all blocks are independent */
+        if (g_maxDistance <= BLOCKSIZE)   /* static test ; yes == blocks are independent */
         {
             ZSTD_resetCCtx(ctx);
             ctx->base = ip;
@@ -1168,7 +1090,6 @@ size_t ZSTD_compressEnd(ZSTD_Cctx*  ctx, void* dst, size_t maxDstSize)
 static size_t ZSTD_compressCCtx(ZSTD_Cctx* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
     BYTE* const ostart = (BYTE* const)dst;
-    BYTE* const oend = ostart + maxDstSize;
     BYTE* op = ostart;
 
     /* Header */
@@ -1181,7 +1102,7 @@ static size_t ZSTD_compressCCtx(ZSTD_Cctx* ctx, void* dst, size_t maxDstSize, co
 
     /* Compression */
     {
-        size_t cSize = ZSTD_compressContinue(ctx, op, oend-op, src, srcSize);
+        size_t cSize = ZSTD_compressContinue(ctx, op, maxDstSize, src, srcSize);
         if (ZSTD_isError(cSize)) return cSize;
         op += cSize;
         maxDstSize -= cSize;
@@ -1189,7 +1110,7 @@ static size_t ZSTD_compressCCtx(ZSTD_Cctx* ctx, void* dst, size_t maxDstSize, co
 
     /* Close frame */
     {
-        size_t endSize = ZSTD_compressEnd(ctx, op, oend-op);
+        size_t endSize = ZSTD_compressEnd(ctx, op, maxDstSize);
         if(ZSTD_isError(endSize)) return endSize;
         op += endSize;
     }
@@ -1204,13 +1125,13 @@ size_t ZSTD_compress(void* dst, size_t maxDstSize, const void* src, size_t srcSi
     size_t r;
 
     ctx = ZSTD_createCCtx();
+    if (ctx==NULL) return (size_t)-ZSTD_ERROR_GENERIC;
     r = ZSTD_compressCCtx(ctx, dst, maxDstSize, src, srcSize);
     ZSTD_freeCCtx(ctx);
     return r;
 }
 
 
-
 /**************************************************************
 *   Decompression code
 **************************************************************/
@@ -1221,7 +1142,7 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bp
     BYTE headerFlags;
     U32 cSize;
 
-    if (srcSize < 3) return (size_t)-ZSTD_ERROR_wrongSrcSize;
+    if (srcSize < 3) return (size_t)-ZSTD_ERROR_SrcSize;
 
     headerFlags = *in;
     cSize = in[2] + (in[1]<<8) + ((in[0] & 7)<<16);
@@ -1243,106 +1164,29 @@ static size_t ZSTD_copyUncompressedBlock(void* dst, size_t maxDstSize, const voi
 }
 
 
-/* force inline : 'fast' really needs to be evaluated at compile time */
-FORCE_INLINE size_t ZSTD_decompressLiterals_usingDTable_generic(
-                       void* const dst, size_t maxDstSize,
-                 const void* src, size_t srcSize,
-                 const FSE_DTable* DTable, U32 fast)
-{
-    BYTE* op = (BYTE*) dst;
-    BYTE* const olimit = op;
-    BYTE* const oend = op + maxDstSize;
-    FSE_DStream_t bitD;
-    FSE_DState_t state1, state2;
-    size_t errorCode;
-
-    /* Init */
-    errorCode = FSE_initDStream(&bitD, src, srcSize);
-    if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC;
-
-    FSE_initDState(&state1, &bitD, DTable);
-    FSE_initDState(&state2, &bitD, DTable);
-    op = oend;
-
-    /* 2-4 symbols per loop */
-    while (!FSE_reloadDStream(&bitD) && (op>olimit+3))
-    {
-        *--op = fast ? FSE_decodeSymbolFast(&state1, &bitD) : FSE_decodeSymbol(&state1, &bitD);
-
-        if (LitFSELog*2+7 > sizeof(size_t)*8)    /* This test must be static */
-            FSE_reloadDStream(&bitD);
-
-        *--op = fast ? FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD);
-
-        if (LitFSELog*4+7 < sizeof(size_t)*8)    /* This test must be static */
-        {
-            *--op = fast ? FSE_decodeSymbolFast(&state1, &bitD) : FSE_decodeSymbol(&state1, &bitD);
-            *--op = fast ? FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD);
-        }
-    }
-
-    /* tail */
-    while (1)
-    {
-        if ( (FSE_reloadDStream(&bitD)>2) || (op==olimit) || (FSE_endOfDState(&state1) && FSE_endOfDStream(&bitD)) )
-            break;
-
-        *--op = fast ? FSE_decodeSymbolFast(&state1, &bitD) : FSE_decodeSymbol(&state1, &bitD);
-
-        if ( (FSE_reloadDStream(&bitD)>2) || (op==olimit) || (FSE_endOfDState(&state2) && FSE_endOfDStream(&bitD)) )
-            break;
-
-        *--op = fast ? FSE_decodeSymbolFast(&state2, &bitD) : FSE_decodeSymbol(&state2, &bitD);
-    }
-
-    /* end ? */
-    if (FSE_endOfDStream(&bitD) && FSE_endOfDState(&state1) && FSE_endOfDState(&state2) )
-        return oend-op;
-
-    if (op==olimit) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;   /* dst buffer is full, but cSrc unfinished */
-
-    return (size_t)-ZSTD_ERROR_GENERIC;
-}
-
-static size_t ZSTD_decompressLiterals_usingDTable(
-                       void* const dst, size_t maxDstSize,
-                 const void* src, size_t srcSize,
-                 const FSE_DTable* DTable, U32 fast)
-{
-    if (fast) return ZSTD_decompressLiterals_usingDTable_generic(dst, maxDstSize, src, srcSize, DTable, 1);
-    return ZSTD_decompressLiterals_usingDTable_generic(dst, maxDstSize, src, srcSize, DTable, 0);
-}
-
-static size_t ZSTD_decompressLiterals(void* ctx, void* dst, size_t maxDstSize,
+static size_t ZSTD_decompressLiterals(void* ctx,
+                                      void* dst, size_t maxDstSize,
                                 const void* src, size_t srcSize)
 {
-    /* assumed : blockType == blockCompressed */
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + maxDstSize;
     const BYTE* ip = (const BYTE*)src;
-    short norm[256];
-    FSE_DTable* DTable = (FSE_DTable*)ctx;
-    U32 maxSymbolValue = 255;
-    U32 tableLog;
-    U32 fastMode;
     size_t errorCode;
+    size_t litSize = ip[1] + (ip[0]<<8);
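+    litSize += ((ip[-3] >> 3) & 7) << 16;   /* bits 3-5 of the byte located 3 positions before src supply litSize bits 16-18 ; presumably the block header's first byte -- TODO confirm, fragile */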
+    op = oend - litSize;
 
-    if (srcSize < 2) return (size_t)-ZSTD_ERROR_wrongLBlockSize;   /* too small input size */
-
-    errorCode = FSE_readNCount (norm, &maxSymbolValue, &tableLog, ip, srcSize);
+    (void)ctx;
+    if (litSize > maxDstSize) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;
+    errorCode = HUF_decompress(op, litSize, ip+2, srcSize-2);
     if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC;
-    ip += errorCode;
-    srcSize -= errorCode;
-
-    errorCode = FSE_buildDTable (DTable, norm, maxSymbolValue, tableLog);
-    if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_GENERIC;
-    fastMode = (U32)errorCode;
-
-    return ZSTD_decompressLiterals_usingDTable (dst, maxDstSize, ip, srcSize, DTable, fastMode);
+    return litSize;
 }
 
 
 size_t ZSTD_decodeLiteralsBlock(void* ctx,
                                 void* dst, size_t maxDstSize,
-                          const BYTE** litPtr,
+                          const BYTE** litStart, size_t* litSize,
                           const void* src, size_t srcSize)
 {
     const BYTE* const istart = (const BYTE* const)src;
@@ -1353,25 +1197,32 @@ size_t ZSTD_decodeLiteralsBlock(void* ctx,
 
     size_t litcSize = ZSTD_getcBlockSize(src, srcSize, &litbp);
     if (ZSTD_isError(litcSize)) return litcSize;
-    if (litcSize > srcSize - ZSTD_blockHeaderSize) return (size_t)-ZSTD_ERROR_wrongLBlockSize;
+    if (litcSize > srcSize - ZSTD_blockHeaderSize) return (size_t)-ZSTD_ERROR_SrcSize;
     ip += ZSTD_blockHeaderSize;
 
     switch(litbp.blockType)
     {
-    case bt_raw: *litPtr = ip; ip+= litcSize; break;
+    case bt_raw:
+        *litStart = ip;
+        ip += litcSize;
+        *litSize = litcSize;
+        break;
     case bt_rle:
         {
             size_t rleSize = litbp.origSize;
+            if (rleSize>maxDstSize) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;
             memset(oend - rleSize, *ip, rleSize);
-            *litPtr = oend - rleSize;
+            *litStart = oend - rleSize;
+            *litSize = rleSize;
             ip++;
             break;
         }
     case bt_compressed:
         {
-            size_t litSize = ZSTD_decompressLiterals(ctx, dst, maxDstSize, ip, litcSize);
-            if (ZSTD_isError(litSize)) return litSize;
-            *litPtr = oend - litSize;
+            size_t decodedLitSize = ZSTD_decompressLiterals(ctx, dst, maxDstSize, ip, litcSize);
+            if (ZSTD_isError(decodedLitSize)) return decodedLitSize;
+            *litStart = oend - decodedLitSize;
+            *litSize = decodedLitSize;
             ip += litcSize;
             break;
         }
@@ -1383,7 +1234,7 @@ size_t ZSTD_decodeLiteralsBlock(void* ctx,
 }
 
 
-size_t ZSTD_decodeSeqHeaders(size_t* lastLLPtr, const BYTE** dumpsPtr,
+size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr,
                          FSE_DTable* DTableLL, FSE_DTable* DTableML, FSE_DTable* DTableOffb,
                          const void* src, size_t srcSize)
 {
@@ -1394,8 +1245,11 @@ size_t ZSTD_decodeSeqHeaders(size_t* lastLLPtr, const BYTE** dumpsPtr,
     U32 LLlog, Offlog, MLlog;
     size_t dumpsLength;
 
+	/* check */
+	if (srcSize < 5) return (size_t)-ZSTD_ERROR_SrcSize;
+
     /* SeqHead */
-    ip += ZSTD_readProgressive(lastLLPtr, ip);
+    *nbSeq = ZSTD_readLE16(ip); ip+=2;
     LLtype  = *ip >> 6;
     Offtype = (*ip >> 4) & 3;
     MLtype  = (*ip >> 2) & 3;
@@ -1414,6 +1268,9 @@ size_t ZSTD_decodeSeqHeaders(size_t* lastLLPtr, const BYTE** dumpsPtr,
     *dumpsPtr = ip;
     ip += dumpsLength;
 
+	/* check */
+	if (ip > iend-1) return (size_t)-ZSTD_ERROR_SrcSize;
+
     /* sequences */
     {
         S16 norm[MaxML+1];    /* assumption : MaxML >= MaxLL and MaxOff */
@@ -1433,6 +1290,7 @@ size_t ZSTD_decodeSeqHeaders(size_t* lastLLPtr, const BYTE** dumpsPtr,
             max = MaxLL;
             headerSize = FSE_readNCount(norm, &max, &LLlog, ip, iend-ip);
             if (FSE_isError(headerSize)) return (size_t)-ZSTD_ERROR_GENERIC;
+			if (LLlog > LLFSELog) return (size_t)-ZSTD_ERROR_corruption;
             ip += headerSize;
             FSE_buildDTable(DTableLL, norm, max, LLlog);
         }
@@ -1450,6 +1308,7 @@ size_t ZSTD_decodeSeqHeaders(size_t* lastLLPtr, const BYTE** dumpsPtr,
             max = MaxOff;
             headerSize = FSE_readNCount(norm, &max, &Offlog, ip, iend-ip);
             if (FSE_isError(headerSize)) return (size_t)-ZSTD_ERROR_GENERIC;
+			if (Offlog > OffFSELog) return (size_t)-ZSTD_ERROR_corruption;
             ip += headerSize;
             FSE_buildDTable(DTableOffb, norm, max, Offlog);
         }
@@ -1467,6 +1326,7 @@ size_t ZSTD_decodeSeqHeaders(size_t* lastLLPtr, const BYTE** dumpsPtr,
             max = MaxML;
             headerSize = FSE_readNCount(norm, &max, &MLlog, ip, iend-ip);
             if (FSE_isError(headerSize)) return (size_t)-ZSTD_ERROR_GENERIC;
+			if (MLlog > MLFSELog) return (size_t)-ZSTD_ERROR_corruption;
             ip += headerSize;
             FSE_buildDTable(DTableML, norm, max, MLlog);
         }
@@ -1476,174 +1336,262 @@ size_t ZSTD_decodeSeqHeaders(size_t* lastLLPtr, const BYTE** dumpsPtr,
 }
 
 
-#define ZSTD_prefetch(p) { const BYTE pByte = *(volatile const BYTE*)p; }
+typedef struct {
+    size_t litLength;
+    size_t offset;
+    size_t matchLength;
+} seq_t;
 
-FORCE_INLINE size_t ZSTD_decompressBlock(void* ctx, void* dst, size_t maxDstSize,
-                             const void* src, size_t srcSize)
+typedef struct {
+    FSE_DStream_t DStream;
+    FSE_DState_t stateLL;
+    FSE_DState_t stateOffb;
+    FSE_DState_t stateML;
+    size_t prevOffset;
+    const BYTE* dumps;
+} seqState_t;
+
+
+static void ZSTD_decodeSequence(seq_t* seq, seqState_t* seqState)
 {
-    const BYTE* ip = (const BYTE*)src;
-    const BYTE* const iend = ip + srcSize;
+    size_t litLength;
+    size_t prevOffset;
+    size_t offset;
+    size_t matchLength;
+    const BYTE* dumps = seqState->dumps;
+
+    /* Literal length */
+    litLength = FSE_decodeSymbol(&(seqState->stateLL), &(seqState->DStream));
+    prevOffset = litLength ? seq->offset : seqState->prevOffset;
+    seqState->prevOffset = seq->offset;
+    if (litLength == MaxLL)
+    {
+        U32 add = *dumps++;
+        if (add < 255) litLength += add;
+        else
+        {
+            litLength = ZSTD_readLE32(dumps) & 0xFFFFFF;
+            dumps += 3;
+        }
+    }
+
+    /* Offset */
+    {
+        U32 offsetCode, nbBits;
+        offsetCode = FSE_decodeSymbol(&(seqState->stateOffb), &(seqState->DStream));
+        if (ZSTD_32bits()) FSE_reloadDStream(&(seqState->DStream));
+        nbBits = offsetCode - 1;
+        if (offsetCode==0) nbBits = 0;   /* cmove */
+        offset = ((size_t)1 << nbBits) + FSE_readBits(&(seqState->DStream), nbBits);
+        if (ZSTD_32bits()) FSE_reloadDStream(&(seqState->DStream));
+        if (offsetCode==0) offset = prevOffset;
+    }
+
+    /* MatchLength */
+    matchLength = FSE_decodeSymbol(&(seqState->stateML), &(seqState->DStream));
+    if (matchLength == MaxML)
+    {
+        U32 add = *dumps++;
+        if (add < 255) matchLength += add;
+        else
+        {
+            matchLength = ZSTD_readLE32(dumps) & 0xFFFFFF;   /* no pb : dumps is always followed by seq tables > 1 byte */
+            dumps += 3;
+        }
+    }
+    matchLength += MINMATCH;
+
+    /* save result */
+    seq->litLength = litLength;
+    seq->offset = offset;
+    seq->matchLength = matchLength;
+    seqState->dumps = dumps;
+}
+
+
+static size_t ZSTD_execSequence(BYTE* op,
+								seq_t sequence,
+								const BYTE** litPtr, const BYTE* const litLimit,
+								BYTE* const base, BYTE* const oend)
+{
+    static const int dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4};   /* added */
+    static const int dec64table[] = {8, 8, 8, 7, 8, 9,10,11};   /* subtracted */
+    const BYTE* const ostart = op;
+    const size_t litLength = sequence.litLength;
+    BYTE* const endMatch = op + litLength + sequence.matchLength;    /* risk : address space overflow (32-bits) */
+    const BYTE* const litEnd = *litPtr + litLength;
+
+    /* check */
+    if (endMatch > oend) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;   /* overwrite beyond dst buffer */
+	if (litEnd > litLimit) return (size_t)-ZSTD_ERROR_corruption;
+    if (sequence.matchLength > (size_t)(*litPtr-op))  return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;    /* overwrite literal segment */
+
+    /* copy Literals */
+    if (((size_t)(*litPtr - op) < 8) || ((size_t)(oend-litEnd) < 8) || (op+litLength > oend-8))
+        memmove(op, *litPtr, litLength);   /* overwrite risk */
+    else
+        ZSTD_wildcopy(op, *litPtr, litLength);
+    op += litLength;
+    *litPtr = litEnd;   /* update for next sequence */
+
+    /* check : last match must be at a minimum distance of 8 from end of dest buffer */
+	if (oend-op < 8) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;
+
+	/* copy Match */
+    {
+        const U32 overlapRisk = (((size_t)(litEnd - endMatch)) < 12);
+        const BYTE* match = op - sequence.offset;            /* possible underflow at op - offset ? */
+        size_t qutt = 12;
+        U64 saved[2];
+
+		/* check */
+		if (match < base) return (size_t)-ZSTD_ERROR_corruption;
+		if (sequence.offset > (size_t)base) return (size_t)-ZSTD_ERROR_corruption;
+
+        /* save beginning of literal sequence, in case of write overlap */
+        if (overlapRisk)
+        {
+            if ((endMatch + qutt) > oend) qutt = oend-endMatch;
+            memcpy(saved, endMatch, qutt);
+        }
+
+        if (sequence.offset < 8)
+        {
+            const int dec64 = dec64table[sequence.offset];
+            op[0] = match[0];
+            op[1] = match[1];
+            op[2] = match[2];
+            op[3] = match[3];
+            match += dec32table[sequence.offset];
+            ZSTD_copy4(op+4, match);
+            match -= dec64;
+        } else { ZSTD_copy8(op, match); }
+		op += 8; match += 8;
+
+        if (endMatch > oend-12)
+        {
+            if (op < oend-8)
+            {
+                ZSTD_wildcopy(op, match, (oend-8) - op);
+                match += (oend-8) - op;
+                op = oend-8;
+            }
+            while (op<endMatch) *op++ = *match++;
+        }
+        else
+            ZSTD_wildcopy(op, match, sequence.matchLength-8);   /* works even if matchLength < 8 */
+
+        /* restore, in case of overlap */
+        if (overlapRisk) memcpy(endMatch, saved, qutt);
+    }
+
+    return endMatch-ostart;
+}
+
+typedef struct ZSTD_Dctx_s
+{
+    U32 LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)];
+	U32 OffTable[FSE_DTABLE_SIZE_U32(OffFSELog)];
+	U32 MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)];
+    void* previousDstEnd;
+    void* base;
+	size_t expected;
+    blockType_t bType;
+    U32 phase;
+} dctx_t;
+
+
+static size_t ZSTD_decompressSequences(
+                               void* ctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize,
+                         const BYTE* litStart, size_t litSize)
+{
+	dctx_t* dctx = (dctx_t*)ctx;
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
     BYTE* const ostart = (BYTE* const)dst;
     BYTE* op = ostart;
     BYTE* const oend = ostart + maxDstSize;
     size_t errorCode;
-    size_t lastLLSize;
+    const BYTE* litPtr = litStart;
+    const BYTE* const litEnd = litStart + litSize;
+    int nbSeq;
     const BYTE* dumps;
-    const BYTE* litPtr;
-    const BYTE* litEnd;
-    const size_t dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4};   /* added */
-    const size_t dec64table[] = {8, 8, 8, 7, 8, 9,10,11};   /* substracted */
-    FSE_DTable* DTableML = (FSE_DTable*)ctx;
-    FSE_DTable* DTableLL = DTableML + FSE_DTABLE_SIZE_U32(MLFSELog);
-    FSE_DTable* DTableOffb = DTableLL + FSE_DTABLE_SIZE_U32(LLFSELog);
-
-    /* blockType == blockCompressed, srcSize is trusted */
-
-    /* literal sub-block */
-    errorCode = ZSTD_decodeLiteralsBlock(ctx, dst, maxDstSize, &litPtr, src, srcSize);
-    if (ZSTD_isError(errorCode)) return errorCode;
-    ip += errorCode;
+    U32* DTableLL = dctx->LLTable;
+	U32* DTableML = dctx->MLTable;
+    U32* DTableOffb = dctx->OffTable;
+	BYTE* const base = (BYTE*) (dctx->base);
 
     /* Build Decoding Tables */
-    errorCode = ZSTD_decodeSeqHeaders(&lastLLSize, &dumps,
+    errorCode = ZSTD_decodeSeqHeaders(&nbSeq, &dumps,
                                       DTableLL, DTableML, DTableOffb,
                                       ip, iend-ip);
     if (ZSTD_isError(errorCode)) return errorCode;
-    /* end pos */
-    if ((litPtr>=ostart) && (litPtr<=oend))   /* decoded literals are into dst buffer */
-        litEnd = oend - lastLLSize;
-    else
-        litEnd = ip - lastLLSize;
     ip += errorCode;
 
-    /* decompression */
+    /* Regen sequences */
     {
-        FSE_DStream_t DStream;
-        FSE_DState_t stateLL, stateOffb, stateML;
-        size_t prevOffset = 0, offset = 0;
+        seq_t sequence;
+        seqState_t seqState;
 
-        FSE_initDStream(&DStream, ip, iend-ip);
-        FSE_initDState(&stateLL, &DStream, DTableLL);
-        FSE_initDState(&stateOffb, &DStream, DTableOffb);
-        FSE_initDState(&stateML, &DStream, DTableML);
+        memset(&sequence, 0, sizeof(sequence));
+        seqState.dumps = dumps;
+        seqState.prevOffset = 1;
+        errorCode = FSE_initDStream(&(seqState.DStream), ip, iend-ip);
+        if (FSE_isError(errorCode)) return (size_t)-ZSTD_ERROR_corruption;
+        FSE_initDState(&(seqState.stateLL), &(seqState.DStream), DTableLL);
+        FSE_initDState(&(seqState.stateOffb), &(seqState.DStream), DTableOffb);
+        FSE_initDState(&(seqState.stateML), &(seqState.DStream), DTableML);
 
-        while (FSE_reloadDStream(&DStream)<2)
+        for ( ; (FSE_reloadDStream(&(seqState.DStream)) < FSE_DStream_completed) || (nbSeq>0) ; )
         {
-            U32 nbBits, offsetCode;
-            const BYTE* match;
-            size_t litLength;
-            size_t matchLength;
-            size_t newOffset;
-
-_another_round:
-
-            /* Literals */
-            litLength = FSE_decodeSymbol(&stateLL, &DStream);
-            if (litLength) prevOffset = offset;
-            if (litLength == MaxLL)
-            {
-                BYTE add = *dumps++;
-                if (add < 255) litLength += add;
-                else
-                {
-                    litLength = ZSTD_readLE32(dumps) & 0xFFFFFF;
-                    dumps += 3;
-                }
-            }
-            if (((size_t)(litPtr - op) < 8) || ((size_t)(oend-(litPtr+litLength)) < 8))
-                memmove(op, litPtr, litLength);   /* overwrite risk */
-            else
-                ZSTD_wildcopy(op, litPtr, litLength);
-            op += litLength;
-            litPtr += litLength;
-
-            /* Offset */
-            offsetCode = FSE_decodeSymbol(&stateOffb, &DStream);
-            if (ZSTD_32bits()) FSE_reloadDStream(&DStream);
-            nbBits = offsetCode - 1;
-            if (offsetCode==0) nbBits = 0;   /* cmove */
-            newOffset = FSE_readBits(&DStream, nbBits);
-            if (ZSTD_32bits()) FSE_reloadDStream(&DStream);
-            newOffset += (size_t)1 << nbBits;
-            if (offsetCode==0) newOffset = prevOffset;
-            match = op - newOffset;
-            prevOffset = offset;
-            offset = newOffset;
-
-            /* MatchLength */
-            matchLength = FSE_decodeSymbol(&stateML, &DStream);
-            if (matchLength == MaxML)
-            {
-                BYTE add = *dumps++;
-                if (add < 255) matchLength += add;
-                else
-                {
-                    matchLength = ZSTD_readLE32(dumps) & 0xFFFFFF;   /* no pb : dumps is always followed by seq tables > 1 byte */
-                    dumps += 3;
-                }
-            }
-            matchLength += MINMATCH;
-
-            /* copy Match */
-            {
-                BYTE* const endMatch = op + matchLength;
-                size_t qutt=0;
-                U64 saved[2];
-
-                /* save beginning of literal sequence, in case of write overlap */
-                if ((size_t)(litPtr - endMatch) < 12)
-                {
-                    qutt = endMatch + 12 - litPtr;
-                    if ((litPtr + qutt) > oend) qutt = oend-litPtr;
-                    memcpy(saved, litPtr, qutt);
-                }
-
-                if (offset < 8)
-                {
-                    const size_t dec64 = dec64table[offset];
-                    op[0] = match[0];
-                    op[1] = match[1];
-                    op[2] = match[2];
-                    op[3] = match[3];
-                    match += dec32table[offset];
-                    ZSTD_copy4(op+4, match);
-                    match -= dec64;
-                } else { ZSTD_copy8(op, match); }
-
-                if (endMatch > oend-12)
-                {
-                    if (op < oend-16)
-                    {
-                        ZSTD_wildcopy(op+8, match+8, (oend-8) - (op+8));
-                        match += (oend-8) - op;
-                        op = oend-8;
-                    }
-                    while (op<endMatch) *op++ = *match++;
-                }
-                else
-                    ZSTD_wildcopy(op+8, match+8, matchLength-8);   /* works even if matchLength < 8 */
-
-                op = endMatch;
-
-                if ((size_t)(litPtr - endMatch) < 12)
-                    memcpy(endMatch + (litPtr - endMatch), saved, qutt);  /* required as litPtr is const ptr */
-            }
+            size_t oneSeqSize;
+            nbSeq--;
+            ZSTD_decodeSequence(&sequence, &seqState);
+            oneSeqSize = ZSTD_execSequence(op, sequence, &litPtr, litEnd, base, oend);
+            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+            op += oneSeqSize;
         }
 
         /* check if reached exact end */
-        if (FSE_reloadDStream(&DStream) > 2) return (size_t)-ZSTD_ERROR_GENERIC;   /* requested too much : data is corrupted */
-        if (!FSE_endOfDState(&stateLL) && !FSE_endOfDState(&stateML) && !FSE_endOfDState(&stateOffb)) goto _another_round;   /* some ultra-compressible sequence remain ! */
-        if (litPtr != litEnd) goto _another_round;   /* literals not entirely spent */
+        if (FSE_reloadDStream(&(seqState.DStream)) > FSE_DStream_completed) return (size_t)-ZSTD_ERROR_corruption;   /* requested too much : data is corrupted */
+        if (nbSeq<0) return (size_t)-ZSTD_ERROR_corruption;   /* requested too many sequences : data is corrupted */
 
         /* last literal segment */
-        if (op != litPtr) memmove(op, litPtr, lastLLSize);
-        op += lastLLSize;
+        {
+            size_t lastLLSize = litEnd - litPtr;
+            if (op+lastLLSize > oend) return (size_t)-ZSTD_ERROR_maxDstSize_tooSmall;
+            if (op != litPtr) memmove(op, litPtr, lastLLSize);
+            op += lastLLSize;
+        }
     }
 
     return op-ostart;
 }
 
 
+static size_t ZSTD_decompressBlock(
+                            void* ctx,
+                            void* dst, size_t maxDstSize,
+                      const void* src, size_t srcSize)
+{
+    /* blockType == blockCompressed, srcSize is trusted */
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* litPtr;
+    size_t litSize;
+    size_t errorCode;
+
+    /* Decode literals sub-block */
+    errorCode = ZSTD_decodeLiteralsBlock(ctx, dst, maxDstSize, &litPtr, &litSize, src, srcSize);
+    if (ZSTD_isError(errorCode)) return errorCode;
+    ip += errorCode;
+    srcSize -= errorCode;
+
+    return ZSTD_decompressSequences(ctx, dst, maxDstSize, ip, srcSize, litPtr, litSize);
+}
+
+
 static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
     const BYTE* ip = (const BYTE*)src;
@@ -1656,22 +1604,21 @@ static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const
     size_t errorCode=0;
     blockProperties_t blockProperties;
 
-    /* Header */
-    if (srcSize < ZSTD_frameHeaderSize) return (size_t)-ZSTD_ERROR_wrongSrcSize;
+    /* Frame Header */
+    if (srcSize < ZSTD_frameHeaderSize+ZSTD_blockHeaderSize) return (size_t)-ZSTD_ERROR_SrcSize;
     magicNumber = ZSTD_readBE32(src);
-    if (magicNumber != ZSTD_magicNumber) return (size_t)-ZSTD_ERROR_wrongMagicNumber;
+    if (magicNumber != ZSTD_magicNumber) return (size_t)-ZSTD_ERROR_MagicNumber;
     ip += ZSTD_frameHeaderSize; remainingSize -= ZSTD_frameHeaderSize;
 
+    /* Loop on each block */
     while (1)
     {
         size_t blockSize = ZSTD_getcBlockSize(ip, iend-ip, &blockProperties);
-        if (ZSTD_isError(blockSize))
-            return blockSize;
+        if (ZSTD_isError(blockSize)) return blockSize;
 
         ip += ZSTD_blockHeaderSize;
         remainingSize -= ZSTD_blockHeaderSize;
-        if (ip+blockSize > iend)
-            return (size_t)-ZSTD_ERROR_wrongSrcSize;
+        if (blockSize > remainingSize) return (size_t)-ZSTD_ERROR_SrcSize;
 
         switch(blockProperties.blockType)
         {
@@ -1682,11 +1629,11 @@ static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const
             errorCode = ZSTD_copyUncompressedBlock(op, oend-op, ip, blockSize);
             break;
         case bt_rle :
-            return (size_t)-ZSTD_ERROR_GENERIC;   /* not yet handled */
+            return (size_t)-ZSTD_ERROR_GENERIC;   /* not yet supported */
             break;
         case bt_end :
             /* end of frame */
-            if (remainingSize) return (size_t)-ZSTD_ERROR_wrongSrcSize;
+            if (remainingSize) return (size_t)-ZSTD_ERROR_SrcSize;
             break;
         default:
             return (size_t)-ZSTD_ERROR_GENERIC;
@@ -1702,11 +1649,11 @@ static size_t ZSTD_decompressDCtx(void* ctx, void* dst, size_t maxDstSize, const
     return op-ostart;
 }
 
-
 size_t ZSTD_decompress(void* dst, size_t maxDstSize, const void* src, size_t srcSize)
 {
-    U32 ctx[FSE_DTABLE_SIZE_U32(LLFSELog) + FSE_DTABLE_SIZE_U32(OffFSELog) + FSE_DTABLE_SIZE_U32(MLFSELog)];
-    return ZSTD_decompressDCtx(ctx, dst, maxDstSize, src, srcSize);
+	dctx_t ctx;
+	ctx.base = dst;
+    return ZSTD_decompressDCtx(&ctx, dst, maxDstSize, src, srcSize);
 }
 
 
@@ -1714,21 +1661,14 @@ size_t ZSTD_decompress(void* dst, size_t maxDstSize, const void* src, size_t src
 *  Streaming Decompression API
 *******************************/
 
-typedef struct ZSTD_Dctx_s
-{
-    U32 ctx[FSE_DTABLE_SIZE_U32(LLFSELog) + FSE_DTABLE_SIZE_U32(OffFSELog) + FSE_DTABLE_SIZE_U32(MLFSELog)];
-    size_t expected;
-    blockType_t bType;
-    U32 phase;
-} dctx_t;
-
-
 ZSTD_Dctx* ZSTD_createDCtx(void)
 {
     ZSTD_Dctx* dctx = (ZSTD_Dctx*)malloc(sizeof(ZSTD_Dctx));
     if (dctx==NULL) return NULL;
     dctx->expected = ZSTD_frameHeaderSize;
     dctx->phase = 0;
+	dctx->previousDstEnd = NULL;
+	dctx->base = NULL;
     return dctx;
 }
 
@@ -1749,14 +1689,16 @@ size_t ZSTD_decompressContinue(ZSTD_Dctx* dctx, void* dst, size_t maxDstSize, co
     dctx_t* ctx = (dctx_t*)dctx;
 
     /* Sanity check */
-    if (srcSize != ctx->expected) return (size_t)-ZSTD_ERROR_wrongSrcSize;
+    if (srcSize != ctx->expected) return (size_t)-ZSTD_ERROR_SrcSize;
+	if (dst != ctx->previousDstEnd)  /* not contiguous */
+		ctx->base = dst;
 
     /* Decompress : frame header */
     if (ctx->phase == 0)
     {
         /* Check frame magic header */
         U32 magicNumber = ZSTD_readBE32(src);
-        if (magicNumber != ZSTD_magicNumber) return (size_t)-ZSTD_ERROR_wrongMagicNumber;
+        if (magicNumber != ZSTD_magicNumber) return (size_t)-ZSTD_ERROR_MagicNumber;
         ctx->phase = 1;
         ctx->expected = ZSTD_blockHeaderSize;
         return 0;
@@ -1805,6 +1747,7 @@ size_t ZSTD_decompressContinue(ZSTD_Dctx* dctx, void* dst, size_t maxDstSize, co
         }
         ctx->phase = 1;
         ctx->expected = ZSTD_blockHeaderSize;
+		ctx->previousDstEnd = (void*)( ((char*)dst) + rSize);
         return rSize;
     }
 
diff --git a/lib/zstd.h b/lib/zstd.h
index 50ee72c4..1e20b4f1 100644
--- a/lib/zstd.h
+++ b/lib/zstd.h
@@ -46,8 +46,8 @@ extern "C" {
 *  Version
 **************************************/
 #define ZSTD_VERSION_MAJOR    0    /* for breaking interface changes  */
-#define ZSTD_VERSION_MINOR    0    /* for new (non-breaking) interface capabilities */
-#define ZSTD_VERSION_RELEASE  2    /* for tweaks, bug-fixes, or development */
+#define ZSTD_VERSION_MINOR    1    /* for new (non-breaking) interface capabilities */
+#define ZSTD_VERSION_RELEASE  0    /* for tweaks, bug-fixes, or development */
 #define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
 unsigned ZSTD_versionNumber (void);
 
@@ -64,8 +64,8 @@ size_t ZSTD_decompress( void* dst, size_t maxOriginalSize,
 /*
 ZSTD_compress() :
     Compresses 'srcSize' bytes from buffer 'src' into buffer 'dst', of maximum size 'dstSize'.
-    Destination buffer should be sized to handle worst cases situations (input data not compressible).
-    Worst case size evaluation is provided by function ZSTD_compressBound().
+    Destination buffer must already be allocated.
+    Compression runs faster if maxDstSize >= ZSTD_compressBound(srcSize).
     return : the number of bytes written into buffer 'dst'
              or an error code if it fails (which can be tested using ZSTD_isError())
 
diff --git a/lib/zstd_static.h b/lib/zstd_static.h
index a059288f..1baa47d3 100644
--- a/lib/zstd_static.h
+++ b/lib/zstd_static.h
@@ -74,9 +74,9 @@ size_t ZSTD_decompressContinue(ZSTD_Dctx* dctx, void* dst, size_t maxDstSize, co
 **************************************/
 #define ZSTD_LIST_ERRORS(ITEM) \
         ITEM(ZSTD_OK_NoError) ITEM(ZSTD_ERROR_GENERIC) \
-        ITEM(ZSTD_ERROR_wrongMagicNumber) \
-        ITEM(ZSTD_ERROR_wrongSrcSize) ITEM(ZSTD_ERROR_maxDstSize_tooSmall) \
-        ITEM(ZSTD_ERROR_wrongLBlockSize) \
+        ITEM(ZSTD_ERROR_MagicNumber) \
+        ITEM(ZSTD_ERROR_SrcSize) ITEM(ZSTD_ERROR_maxDstSize_tooSmall) \
+        ITEM(ZSTD_ERROR_corruption) \
         ITEM(ZSTD_ERROR_maxCode)
 
 #define ZSTD_GENERATE_ENUM(ENUM) ENUM,
diff --git a/programs/.gitignore b/programs/.gitignore
new file mode 100644
index 00000000..021e8937
--- /dev/null
+++ b/programs/.gitignore
@@ -0,0 +1,31 @@
+# local binary (Makefile)
+zstd
+zstd32
+fullbench
+fullbench32
+fuzzer
+fuzzer32
+datagen
+
+# Object files
+*.o
+*.ko
+
+# Libraries
+*.lib
+*.a
+
+# Shared objects (inc. Windows DLLs)
+*.dll
+*.so
+*.so.*
+*.dylib
+
+# Executables
+*.exe
+*.out
+*.app
+
+# Visual solution files
+*.suo
+*.user
diff --git a/programs/Makefile b/programs/Makefile
index 20205b90..b90cfdce 100644
--- a/programs/Makefile
+++ b/programs/Makefile
@@ -30,7 +30,7 @@
 # fullbench32: Same as fullbench, but forced to compile in 32-bits mode
 # ##########################################################################
 
-RELEASE?= v0.0.2
+RELEASE?= v0.1.0
 
 DESTDIR?=
 PREFIX ?= /usr
@@ -61,7 +61,7 @@ default: zstd
 
 all: zstd zstd32 fullbench fullbench32 fuzzer fuzzer32 datagen
 
-zstd: $(ZSTDDIR)/zstd.c xxhash.c bench.c fileio.c zstdcli.c
+zstd  : $(ZSTDDIR)/zstd.c xxhash.c bench.c fileio.c zstdcli.c
 	$(CC)      $(FLAGS) $^ -o $@$(EXT)
 
 zstd32: $(ZSTDDIR)/zstd.c xxhash.c bench.c fileio.c zstdcli.c
@@ -73,10 +73,10 @@ fullbench  : $(ZSTDDIR)/zstd.c datagen.c fullbench.c
 fullbench32: $(ZSTDDIR)/zstd.c datagen.c fullbench.c
 	$(CC) -m32 $(FLAGS) $^ -o $@$(EXT)
 
-fuzzer  : $(ZSTDDIR)/zstd.c xxhash.c fuzzer.c
+fuzzer  : $(ZSTDDIR)/zstd.c datagen.c xxhash.c fuzzer.c
 	$(CC)      $(FLAGS) $^ -o $@$(EXT)
 
-fuzzer32: $(ZSTDDIR)/zstd.c xxhash.c fuzzer.c
+fuzzer32: $(ZSTDDIR)/zstd.c datagen.c xxhash.c fuzzer.c
 	$(CC) -m32 $(FLAGS) $^ -o $@$(EXT)
 
 datagen : datagen.c datagencli.c
diff --git a/programs/bench.c b/programs/bench.c
index 31a43f32..5fdf0109 100644
--- a/programs/bench.c
+++ b/programs/bench.c
@@ -297,34 +297,37 @@ static int BMK_benchMem(void* srcBuffer, size_t srcSize, char* fileName, int cLe
             milliTime = BMK_GetMilliStart();
             while (BMK_GetMilliStart() == milliTime);
             milliTime = BMK_GetMilliStart();
-            while (BMK_GetMilliSpan(milliTime) < TIMELOOP)
+            for ( ; BMK_GetMilliSpan(milliTime) < TIMELOOP; nbLoops++)
             {
-                ZSTD_decompress(resultBuffer, srcSize, compressedBuffer, cSize);
-                nbLoops++;
+                size_t result = ZSTD_decompress(resultBuffer, srcSize, compressedBuffer, cSize);
+                if (ZSTD_isError(result))
+                {
+                    DISPLAY("\n!!! Decompression error !!! %s  !\n", ZSTD_getErrorName(result));
+                    break;
+                }
             }
             milliTime = BMK_GetMilliSpan(milliTime);
 
             if ((double)milliTime < fastestD*nbLoops) fastestD = (double)milliTime / nbLoops;
             DISPLAY("%1i-%-14.14s : %9i -> %9i (%5.2f%%),%7.1f MB/s ,%7.1f MB/s\r", loopNb, fileName, (int)srcSize, (int)cSize, ratio, (double)srcSize / fastestC / 1000., (double)srcSize / fastestD / 1000.);
-#endif
 
             /* CRC Checking */
             crcCheck = XXH64(resultBuffer, srcSize, 0);
             if (crcOrig!=crcCheck)
             {
-                unsigned i = 0;
+                unsigned i;
                 DISPLAY("\n!!! WARNING !!! %14s : Invalid Checksum : %x != %x\n", fileName, (unsigned)crcOrig, (unsigned)crcCheck);
-                while (i<srcSize)
+                for (i=0; i<srcSize; i++)
                 {
                     if (((BYTE*)srcBuffer)[i] != ((BYTE*)resultBuffer)[i])
                     {
                         printf("\nDecoding error at pos %u   \n", i);
                         break;
                     }
-                    i++;
                 }
                 break;
             }
+#endif
         }
 
         if (crcOrig == crcCheck)
diff --git a/programs/datagen.c b/programs/datagen.c
index fa0e62a1..23d7d15f 100644
--- a/programs/datagen.c
+++ b/programs/datagen.c
@@ -153,7 +153,7 @@ void RDG_genBlock(void* buffer, size_t buffSize, size_t prefixSize, double match
         memset(buffPtr+pos, 0, size0);
         pos += size0;
         buffPtr[pos-1] = RDG_genChar(seed, lt);
-        return;
+        continue;
     }
 
     /* init */
@@ -200,7 +200,7 @@ void RDG_genBuffer(void* buffer, size_t size, double matchProba, double litProba
 
 #define RDG_DICTSIZE  (32 KB)
 #define RDG_BLOCKSIZE (128 KB)
-void RDG_genOut(unsigned long long size, double matchProba, double litProba, unsigned seed)
+void RDG_genStdout(unsigned long long size, double matchProba, double litProba, unsigned seed)
 {
     BYTE* buff = (BYTE*)malloc(RDG_DICTSIZE + RDG_BLOCKSIZE);
     U64 total = 0;
diff --git a/programs/datagen.h b/programs/datagen.h
index 89482dc2..03b06cae 100644
--- a/programs/datagen.h
+++ b/programs/datagen.h
@@ -26,15 +26,15 @@
 
 #include <stddef.h>   /* size_t */
 
-void RDG_genOut(unsigned long long size, double matchProba, double litProba, unsigned seed);
+void RDG_genStdout(unsigned long long size, double matchProba, double litProba, unsigned seed);
 void RDG_genBuffer(void* buffer, size_t size, double matchProba, double litProba, unsigned seed);
 /* RDG_genBuffer
    Generate 'size' bytes of compressible data into 'buffer'.
-   Compressibility can be controlled using 'matchProba'.
-   'LitProba' is optional, and affect variability of individual bytes. If litProba==0.0, default value is used.
+   Compressibility can be controlled using 'matchProba', which is a floating point value between 0 and 1.
+   'litProba' is optional; it affects the variability of individual bytes. If litProba==0.0, a default value will be used.
    Generated data pattern can be modified using different 'seed'.
-   If (matchProba, litProba and seed) are equal, the function always generate the same content.
+   For a given triplet (matchProba, litProba, seed), the function always generates the same content.
 
-   RDG_genOut
-   Same as RDG_genBuffer, but generate data towards stdout
+   RDG_genStdout
+   Same as RDG_genBuffer, but generates data into stdout
 */
diff --git a/programs/datagencli.c b/programs/datagencli.c
index 801e1980..2665c54b 100644
--- a/programs/datagencli.c
+++ b/programs/datagencli.c
@@ -183,7 +183,7 @@ int main(int argc, char** argv)
     DISPLAYLEVEL(3, "Seed = %u \n", seed);
     if (proba!=COMPRESSIBILITY_DEFAULT) DISPLAYLEVEL(3, "Compressibility : %i%%\n", (U32)(proba*100));
 
-    RDG_genOut(size, proba, litProba, seed);
+    RDG_genStdout(size, proba, litProba, seed);
     DISPLAYLEVEL(1, "\n");
 
     return 0;
diff --git a/programs/fullbench.c b/programs/fullbench.c
index b7ecc8db..a7dbac3b 100644
--- a/programs/fullbench.c
+++ b/programs/fullbench.c
@@ -229,9 +229,7 @@ typedef struct
 static size_t g_cSize = 0;
 
 extern size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr);
-extern size_t ZSTD_decodeLiteralsBlock(void* ctx, void* dst, size_t maxDstSize, const BYTE** litPtr, const void* src, size_t srcSize);
-extern size_t ZSTD_decodeSeqHeaders(size_t* lastLLPtr, const BYTE** dumpsPtr, FSE_DTable* DTableLL, FSE_DTable* DTableML, FSE_DTable* DTableOffb, const void* src, size_t srcSize);
-
+extern size_t ZSTD_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, FSE_DTable* DTableLL, FSE_DTable* DTableML, FSE_DTable* DTableOffb, const void* src, size_t srcSize);
 
 size_t local_ZSTD_compress(void* dst, size_t dstSize, void* buff2, const void* src, size_t srcSize)
 {
@@ -245,12 +243,14 @@ size_t local_ZSTD_decompress(void* dst, size_t dstSize, void* buff2, const void*
     return ZSTD_decompress(dst, dstSize, buff2, g_cSize);
 }
 
+extern size_t ZSTD_decodeLiteralsBlock(void* ctx, void* dst, size_t maxDstSize, const BYTE** litStart, size_t* litSize, const void* src, size_t srcSize);
 size_t local_ZSTD_decodeLiteralsBlock(void* dst, size_t dstSize, void* buff2, const void* src, size_t srcSize)
 {
     U32 ctx[1<<12];
     const BYTE* ll;
+    size_t llSize;
     (void)src; (void)srcSize;
-    ZSTD_decodeLiteralsBlock(ctx, dst, dstSize, &ll, buff2, g_cSize);
+    ZSTD_decodeLiteralsBlock(ctx, dst, dstSize, &ll, &llSize, buff2, g_cSize);
     return (const BYTE*)dst + dstSize - ll;
 }
 
@@ -258,9 +258,9 @@ size_t local_ZSTD_decodeSeqHeaders(void* dst, size_t dstSize, void* buff2, const
 {
     U32 DTableML[1<<11], DTableLL[1<<10], DTableOffb[1<<9];
     const BYTE* dumps;
-    size_t lastllSize;
+    int nbSeq;
     (void)src; (void)srcSize; (void)dst; (void)dstSize;
-    return ZSTD_decodeSeqHeaders(&lastllSize, &dumps, DTableLL, DTableML, DTableOffb, buff2, g_cSize);
+    return ZSTD_decodeSeqHeaders(&nbSeq, &dumps, DTableLL, DTableML, DTableOffb, buff2, g_cSize);
 }
 
 size_t local_conditionalNull(void* dst, size_t dstSize, void* buff2, const void* src, size_t srcSize)
diff --git a/programs/fuzzer.c b/programs/fuzzer.c
index db4bc65f..19ab9e3a 100644
--- a/programs/fuzzer.c
+++ b/programs/fuzzer.c
@@ -47,6 +47,7 @@
 #include <sys/timeb.h>   /* timeb */
 #include <string.h>      /* strcmp */
 #include "zstd_static.h"
+#include "datagen.h"     /* RDG_genBuffer */
 #include "xxhash.h"      /* XXH64 */
 
 
@@ -138,47 +139,7 @@ unsigned int FUZ_rand(unsigned int* src)
 }
 
 
-#define FUZ_RAND15BITS  (FUZ_rand(seed) & 0x7FFF)
-#define FUZ_RANDLENGTH  ( (FUZ_rand(seed) & 3) ? (FUZ_rand(seed) % 15) : (FUZ_rand(seed) % 510) + 15)
-static void FUZ_generateSynthetic(void* buffer, size_t bufferSize, double proba, U32* seed)
-{
-    BYTE* BBuffer = (BYTE*)buffer;
-    unsigned pos = 0;
-    U32 P32 = (U32)(32768 * proba);
-
-    // First Byte
-    BBuffer[pos++] = (BYTE)((FUZ_rand(seed) & 0x3F) + '0');
-
-    while (pos < bufferSize)
-    {
-        // Select : Literal (noise) or copy (within 64K)
-        if (FUZ_RAND15BITS < P32)
-        {
-            // Copy (within 64K)
-            size_t match, end;
-            size_t length = FUZ_RANDLENGTH + 4;
-            size_t offset = FUZ_RAND15BITS + 1;
-            if (offset > pos) offset = pos;
-            if (pos + length > bufferSize) length = bufferSize - pos;
-            match = pos - offset;
-            end = pos + length;
-            while (pos < end) BBuffer[pos++] = BBuffer[match++];
-        }
-        else
-        {
-            // Literal (noise)
-            size_t end;
-            size_t length = FUZ_RANDLENGTH;
-            if (pos + length > bufferSize) length = bufferSize - pos;
-            end = pos + length;
-            while (pos < end) BBuffer[pos++] = (BYTE)((FUZ_rand(seed) & 0x3F) + '0');
-        }
-    }
-}
-
-
-/*
-static unsigned FUZ_highbit(U32 v32)
+static unsigned FUZ_highbit32(U32 v32)
 {
     unsigned nbBits = 0;
     if (v32==0) return 0;
@@ -189,7 +150,6 @@ static unsigned FUZ_highbit(U32 v32)
     }
     return nbBits;
 }
-*/
 
 
 static int basicUnitTests(U32 seed, double compressibility)
@@ -202,7 +162,7 @@ static int basicUnitTests(U32 seed, double compressibility)
     size_t result, cSize;
     U32 testNb=0;
 
-    // Create compressible test buffer
+    /* Create compressible test buffer */
     CNBuffer = malloc(COMPRESSIBLE_NOISE_LENGTH);
     compressedBuffer = malloc(ZSTD_compressBound(COMPRESSIBLE_NOISE_LENGTH));
     decodedBuffer = malloc(COMPRESSIBLE_NOISE_LENGTH);
@@ -212,9 +172,9 @@ static int basicUnitTests(U32 seed, double compressibility)
         testResult = 1;
         goto _end;
     }
-    FUZ_generateSynthetic(CNBuffer, COMPRESSIBLE_NOISE_LENGTH, compressibility, &randState);
+    RDG_genBuffer(CNBuffer, COMPRESSIBLE_NOISE_LENGTH, compressibility, 0., randState);
 
-    // Basic tests
+    /* Basic tests */
     DISPLAYLEVEL(4, "test%3i : compress %u bytes : ", testNb++, COMPRESSIBLE_NOISE_LENGTH);
     result = ZSTD_compress(compressedBuffer, ZSTD_compressBound(COMPRESSIBLE_NOISE_LENGTH), CNBuffer, COMPRESSIBLE_NOISE_LENGTH);
     if (ZSTD_isError(result)) goto _output_error;
@@ -239,37 +199,36 @@ static int basicUnitTests(U32 seed, double compressibility)
     DISPLAYLEVEL(4, "test%3i : decompress with 1 missing byte : ", testNb++);
     result = ZSTD_decompress(decodedBuffer, COMPRESSIBLE_NOISE_LENGTH, compressedBuffer, cSize-1);
     if (!ZSTD_isError(result)) goto _output_error;
-    if (result != (size_t)-ZSTD_ERROR_wrongSrcSize) goto _output_error;
+    if (result != (size_t)-ZSTD_ERROR_SrcSize) goto _output_error;
     DISPLAYLEVEL(4, "OK \n");
 
     DISPLAYLEVEL(4, "test%3i : decompress with 1 too much byte : ", testNb++);
     result = ZSTD_decompress(decodedBuffer, COMPRESSIBLE_NOISE_LENGTH, compressedBuffer, cSize+1);
     if (!ZSTD_isError(result)) goto _output_error;
-    if (result != (size_t)-ZSTD_ERROR_wrongSrcSize) goto _output_error;
+    if (result != (size_t)-ZSTD_ERROR_SrcSize) goto _output_error;
     DISPLAYLEVEL(4, "OK \n");
 
     /* Decompression defense tests */
     DISPLAYLEVEL(4, "test%3i : Check input length for magic number : ", testNb++);
     result = ZSTD_decompress(decodedBuffer, COMPRESSIBLE_NOISE_LENGTH, CNBuffer, 3);
     if (!ZSTD_isError(result)) goto _output_error;
-    if (result != (size_t)-ZSTD_ERROR_wrongSrcSize) goto _output_error;
+    if (result != (size_t)-ZSTD_ERROR_SrcSize) goto _output_error;
     DISPLAYLEVEL(4, "OK \n");
 
     DISPLAYLEVEL(4, "test%3i : Check magic Number : ", testNb++);
     ((char*)(CNBuffer))[0] = 1;
     result = ZSTD_decompress(decodedBuffer, COMPRESSIBLE_NOISE_LENGTH, CNBuffer, 4);
     if (!ZSTD_isError(result)) goto _output_error;
-    if (result != (size_t)-ZSTD_ERROR_wrongMagicNumber) goto _output_error;
     DISPLAYLEVEL(4, "OK \n");
 
     /* long rle test */
     {
         size_t sampleSize = 0;
         DISPLAYLEVEL(4, "test%3i : Long RLE test : ", testNb++);
-        FUZ_generateSynthetic(CNBuffer, sampleSize, compressibility, &randState);
+        RDG_genBuffer(CNBuffer, sampleSize, compressibility, 0., randState);
         memset((char*)CNBuffer+sampleSize, 'B', 256 KB - 1);
         sampleSize += 256 KB - 1;
-        FUZ_generateSynthetic((char*)CNBuffer+sampleSize, 96 KB, compressibility, &randState);
+        RDG_genBuffer((char*)CNBuffer+sampleSize, 96 KB, compressibility, 0., randState);
         sampleSize += 96 KB;
         cSize = ZSTD_compress(compressedBuffer, ZSTD_compressBound(sampleSize), CNBuffer, sampleSize);
         if (ZSTD_isError(cSize)) goto _output_error;
@@ -314,6 +273,7 @@ static const U32 maxSampleLog = 22;
 
 int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibility)
 {
+    BYTE* cNoiseBuffer[5];
     BYTE* srcBuffer;
     BYTE* cBuffer;
     BYTE* dstBuffer;
@@ -323,54 +283,172 @@ int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, double compressibilit
     U32 result = 0;
     U32 testNb = 0;
     U32 coreSeed = seed, lseed = 0;
-    (void)startTest; (void)compressibility;
 
     /* allocation */
-    srcBuffer = (BYTE*)malloc (srcBufferSize);
+    cNoiseBuffer[0] = (BYTE*)malloc (srcBufferSize);
+    cNoiseBuffer[1] = (BYTE*)malloc (srcBufferSize);
+    cNoiseBuffer[2] = (BYTE*)malloc (srcBufferSize);
+    cNoiseBuffer[3] = (BYTE*)malloc (srcBufferSize);
+    cNoiseBuffer[4] = (BYTE*)malloc (srcBufferSize);
     dstBuffer = (BYTE*)malloc (dstBufferSize);
     cBuffer   = (BYTE*)malloc (cBufferSize);
-    CHECK (!srcBuffer || !dstBuffer || !cBuffer, "Not enough memory, fuzzer tests cancelled");
+    CHECK (!cNoiseBuffer[0] || !cNoiseBuffer[1] || !cNoiseBuffer[2] || !cNoiseBuffer[3] || !cNoiseBuffer[4] || !dstBuffer || !cBuffer,
+           "Not enough memory, fuzzer tests cancelled");   /* all five sample buffers are written by RDG_genBuffer below */
 
-    /* Create initial sample */
-    FUZ_generateSynthetic(srcBuffer, srcBufferSize, 0.50, &coreSeed);
+    /* Create initial samples */
+    RDG_genBuffer(cNoiseBuffer[0], srcBufferSize, 0.00, 0., coreSeed);    /* pure noise */
+    RDG_genBuffer(cNoiseBuffer[1], srcBufferSize, 0.05, 0., coreSeed);    /* barely compressible */
+    RDG_genBuffer(cNoiseBuffer[2], srcBufferSize, compressibility, 0., coreSeed);
+    RDG_genBuffer(cNoiseBuffer[3], srcBufferSize, 0.95, 0., coreSeed);    /* highly compressible */
+    RDG_genBuffer(cNoiseBuffer[4], srcBufferSize, 1.00, 0., coreSeed);    /* sparse content */
+    srcBuffer = cNoiseBuffer[2];
 
     /* catch up testNb */
-    for (testNb=0; testNb < startTest; testNb++)
+    for (testNb=1; testNb < startTest; testNb++)
         FUZ_rand(&coreSeed);
 
     /* test loop */
-    for (testNb=startTest; testNb < nbTests; testNb++)
+    for ( ; testNb <= nbTests; testNb++ )
     {
         size_t sampleSize, sampleStart;
         size_t cSize, dSize, dSupSize;
-        U32 sampleSizeLog;
+        U32 sampleSizeLog, buffNb;
         U64 crcOrig, crcDest;
 
         /* init */
         DISPLAYUPDATE(2, "\r%6u/%6u   ", testNb, nbTests);
         FUZ_rand(&coreSeed);
         lseed = coreSeed ^ prime1;
+        buffNb = FUZ_rand(&lseed) & 127;
+        if (buffNb & 7) buffNb=2;
+        else
+        {
+            buffNb >>= 3;
+            if (buffNb & 7)
+            {
+                const U32 tnb[2] = { 1, 3 };
+                buffNb = tnb[buffNb >> 3];
+            }
+            else
+            {
+                const U32 tnb[2] = { 0, 4 };
+                buffNb = tnb[buffNb >> 3];
+            }
+        }
+        srcBuffer = cNoiseBuffer[buffNb];
         sampleSizeLog = FUZ_rand(&lseed) % maxSampleLog;
-        sampleSize = (size_t)1<<sampleSizeLog;
+        sampleSize = (size_t)1 << sampleSizeLog;
         sampleSize += FUZ_rand(&lseed) & (sampleSize-1);
         sampleStart = FUZ_rand(&lseed) % (srcBufferSize - sampleSize);
         crcOrig = XXH64(srcBuffer + sampleStart, sampleSize, 0);
 
-        /* compression tests*/
+        /* compression test */
         cSize = ZSTD_compress(cBuffer, cBufferSize, srcBuffer + sampleStart, sampleSize);
         CHECK(ZSTD_isError(cSize), "ZSTD_compress failed");
 
-        /* decompression tests*/
+        /* compression failure test : too small dest buffer */
+        if (cSize > 3)
+        {
+            size_t errorCode;
+            const size_t missing = (FUZ_rand(&lseed) % (cSize-2)) + 1;   /* no problem, as cSize > 3 here, so (cSize-2) is never 0 */
+            const size_t tooSmallSize = cSize - missing;
+            static const U32 endMark = 0x4DC2B1A9;
+            U32 endCheck;
+            memcpy(dstBuffer+tooSmallSize, &endMark, 4);
+            errorCode = ZSTD_compress(dstBuffer, tooSmallSize, srcBuffer + sampleStart, sampleSize);
+            CHECK(!ZSTD_isError(errorCode), "ZSTD_compress should have failed ! (buffer too small)");
+            memcpy(&endCheck, dstBuffer+tooSmallSize, 4);
+            CHECK(endCheck != endMark, "ZSTD_compress : dst buffer overflow");
+        }
+
+        /* successful decompression tests */
         dSupSize = (FUZ_rand(&lseed) & 1) ? 0 : (FUZ_rand(&lseed) & 31) + 1;
         dSize = ZSTD_decompress(dstBuffer, sampleSize + dSupSize, cBuffer, cSize);
         CHECK(dSize != sampleSize, "ZSTD_decompress failed (%s)", ZSTD_getErrorName(dSize));
         crcDest = XXH64(dstBuffer, sampleSize, 0);
         CHECK(crcOrig != crcDest, "dstBuffer corrupted (pos %u / %u)", (U32)findDiff(srcBuffer+sampleStart, dstBuffer, sampleSize), (U32)sampleSize);
+
+        /* truncated src decompression test */
+        {
+            size_t errorCode;
+            const size_t missing = (FUZ_rand(&lseed) % (cSize-2)) + 1;   /* no problem, as cSize > 4 (frameHeaderSizer) */
+            const size_t tooSmallSize = cSize - missing;
+            void* cBufferTooSmall = malloc(tooSmallSize);   /* valgrind will catch overflows */
+            CHECK(cBufferTooSmall == NULL, "not enough memory !");
+            memcpy(cBufferTooSmall, cBuffer, tooSmallSize);
+            errorCode = ZSTD_decompress(dstBuffer, dstBufferSize, cBufferTooSmall, tooSmallSize);
+            CHECK(!ZSTD_isError(errorCode), "ZSTD_decompress should have failed ! (truncated src buffer)");
+            free(cBufferTooSmall);
+        }
+
+        /* too small dst decompression test */
+        if (sampleSize > 3)
+        {
+            size_t errorCode;
+            const size_t missing = (FUZ_rand(&lseed) % (sampleSize-2)) + 1;   /* no problem, as sampleSize > 3 here, so (sampleSize-2) is never 0 */
+            const size_t tooSmallSize = sampleSize - missing;
+            static const BYTE token = 0xA9;
+            dstBuffer[tooSmallSize] = token;
+            errorCode = ZSTD_decompress(dstBuffer, tooSmallSize, cBuffer, cSize);
+            CHECK(!ZSTD_isError(errorCode), "ZSTD_decompress should have failed : %u > %u (dst buffer too small)", (U32)errorCode, (U32)tooSmallSize);
+            CHECK(dstBuffer[tooSmallSize] != token, "ZSTD_decompress : dst buffer overflow");
+        }
+
+        /* noisy src decompression test */
+        if (cSize > 6)
+        {
+            const U32 maxNbBits = FUZ_highbit32((U32)(cSize-4));
+            size_t pos = 4;   /* preserve magic number (too easy to detect) */
+            U32 nbBits = FUZ_rand(&lseed) % maxNbBits;
+            size_t mask = (1<<nbBits) - 1;
+            size_t skipLength = FUZ_rand(&lseed) & mask;
+            pos += skipLength;
+
+            while (pos < cSize)
+            {
+                /* add noise */
+                size_t noiseStart, noiseLength;
+                nbBits = FUZ_rand(&lseed) % maxNbBits;
+                if (nbBits>0) nbBits--;
+                mask = (1<<nbBits) - 1;
+                noiseLength = (FUZ_rand(&lseed) & mask) + 1;
+                if ( pos+noiseLength > cSize ) noiseLength = cSize-pos;
+                noiseStart = FUZ_rand(&lseed) % (srcBufferSize - noiseLength);
+                memcpy(cBuffer + pos, srcBuffer + noiseStart, noiseLength);
+                pos += noiseLength;
+
+                /* keep some original src */
+                nbBits = FUZ_rand(&lseed) % maxNbBits;
+                mask = (1<<nbBits) - 1;
+                skipLength = FUZ_rand(&lseed) & mask;
+                pos += skipLength;
+            }
+
+            /* decompress noisy source */
+            {
+                U32 noiseSrc = FUZ_rand(&lseed) % 5;
+                const U32 endMark = 0xA9B1C3D6;
+                U32 endCheck;
+                size_t errorCode;
+                srcBuffer = cNoiseBuffer[noiseSrc];
+                memcpy(dstBuffer+sampleSize, &endMark, 4);
+                errorCode = ZSTD_decompress(dstBuffer, sampleSize, cBuffer, cSize);
+                /* result *may* be an unlikely success, but even then, it must strictly respect dest buffer boundaries */
+                CHECK((!ZSTD_isError(errorCode)) && (errorCode>sampleSize),
+                      "ZSTD_decompress on noisy src : result is too large : %u > %u (dst buffer)", (U32)errorCode, (U32)sampleSize);
+                memcpy(&endCheck, dstBuffer+sampleSize, 4);
+                CHECK(endMark!=endCheck, "ZSTD_decompress on noisy src : dst buffer overflow");
+            }
+        }
     }
     DISPLAY("\rAll fuzzer tests completed   \n");
 
 _cleanup:
-    free(srcBuffer);
+    free(cNoiseBuffer[0]);
+    free(cNoiseBuffer[1]);
+    free(cNoiseBuffer[2]);
+    free(cNoiseBuffer[3]);
+    free(cNoiseBuffer[4]);
     free(cBuffer);
     free(dstBuffer);
     return result;
@@ -393,8 +471,9 @@ int FUZ_usage(char* programName)
     DISPLAY( " -i#    : Nb of tests (default:%u) \n", nbTestsDefault);
     DISPLAY( " -s#    : Select seed (default:prompt user)\n");
     DISPLAY( " -t#    : Select starting test number (default:0)\n");
-    DISPLAY( " -p#    : Select compressibility in %% (default:%i%%)\n", FUZ_COMPRESSIBILITY_DEFAULT);
+    DISPLAY( " -P#    : Select compressibility in %% (default:%i%%)\n", FUZ_COMPRESSIBILITY_DEFAULT);
     DISPLAY( " -v     : verbose\n");
+    DISPLAY( " -p     : pause at the end\n");
     DISPLAY( " -h     : display help and exit\n");
     return 0;
 }
diff --git a/programs/xxhash.c b/programs/xxhash.c
index e6fb8f14..511d9941 100644
--- a/programs/xxhash.c
+++ b/programs/xxhash.c
@@ -35,13 +35,26 @@ You can contact the author at :
 /**************************************
 *  Tuning parameters
 **************************************/
-/* Unaligned memory access is automatically enabled for "common" CPU, such as x86.
- * For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected.
- * If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance.
- * You can also enable this parameter if you know your input data will always be aligned (boundaries of 4, for U32).
+/* XXH_FORCE_MEMORY_ACCESS
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The switch below allows selecting a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violates the C standard.
+ *            It can generate buggy code on targets which generate assembly depending on alignment.
+ *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://stackoverflow.com/a/32095106/646947 for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
  */
-#if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
-#  define XXH_USE_UNALIGNED_ACCESS 1
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define XXH_FORCE_MEMORY_ACCESS 2
+#  elif defined(__INTEL_COMPILER) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define XXH_FORCE_MEMORY_ACCESS 1
+#  endif
 #endif
 
 /* XXH_ACCEPT_NULL_INPUT_POINTER :
@@ -55,12 +68,21 @@ You can contact the author at :
  * By default, xxHash library provides endian-independant Hash values, based on little-endian convention.
  * Results are therefore identical for little-endian and big-endian CPU.
  * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
- * Should endian-independance be of no importance for your application, you may set the #define below to 1.
- * It will improve speed for Big-endian CPU.
+ * Should endian-independance be of no importance for your application, you may set the #define below to 1,
+ * to improve speed for Big-endian CPU.
  * This option has no impact on Little_Endian CPU.
  */
 #define XXH_FORCE_NATIVE_FORMAT 0
 
+/* XXH_USELESS_ALIGN_BRANCH :
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means : don't make a test between aligned/unaligned, because performance will be the same.
+ * It saves one initial branch per hash.
+ */
+#if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#  define XXH_USELESS_ALIGN_BRANCH 1
+#endif
+
 
 /**************************************
 *  Compiler Specific Options
@@ -113,20 +135,43 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcp
   typedef unsigned long long U64;
 #endif
 
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; }
+static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign;
+
+static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+static U64 XXH_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+
+#else
+
+/* portable and safe solution. Generally efficient.
+ * see : http://stackoverflow.com/a/32095106/646947
+ */
+
 static U32 XXH_read32(const void* memPtr)
 {
-    U32 val32;
-    memcpy(&val32, memPtr, 4);
-    return val32;
+    U32 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
 }
 
 static U64 XXH_read64(const void* memPtr)
 {
-    U64 val64;
-    memcpy(&val64, memPtr, 8);
-    return val64;
+    U64 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
 }
 
+#endif /* XXH_FORCE_MEMORY_ACCESS */
 
 
 /******************************************
@@ -175,8 +220,10 @@ static U64 XXH_swap64 (U64 x)
 *  Architecture Macros
 ***************************************/
 typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
-#ifndef XXH_CPU_LITTLE_ENDIAN   /* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example using a compiler switch */
-static const int one = 1;
+
+/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+    static const int one = 1;
 #   define XXH_CPU_LITTLE_ENDIAN   (*(const char*)(&one))
 #endif
 
@@ -315,7 +362,7 @@ FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH
 }
 
 
-unsigned XXH32 (const void* input, size_t len, unsigned seed)
+unsigned int XXH32 (const void* input, size_t len, unsigned int seed)
 {
 #if 0
     /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
@@ -326,7 +373,7 @@ unsigned XXH32 (const void* input, size_t len, unsigned seed)
 #else
     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
 
-#  if !defined(XXH_USE_UNALIGNED_ACCESS)
+#  if !defined(XXH_USELESS_ALIGN_BRANCH)
     if ((((size_t)input) & 3) == 0)   /* Input is 4-bytes aligned, leverage the speed benefit */
     {
         if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
@@ -466,7 +513,7 @@ unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed
 #else
     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
 
-#  if !defined(XXH_USE_UNALIGNED_ACCESS)
+#  if !defined(XXH_USELESS_ALIGN_BRANCH)
     if ((((size_t)input) & 7)==0)   /* Input is aligned, let's leverage the speed advantage */
     {
         if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
@@ -538,7 +585,7 @@ XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
 
 /*** Hash feed ***/
 
-XXH_errorcode XXH32_reset(XXH32_state_t* state_in, U32 seed)
+XXH_errorcode XXH32_reset(XXH32_state_t* state_in, unsigned int seed)
 {
     XXH_istate32_t* state = (XXH_istate32_t*) state_in;
     state->seed = seed;
@@ -708,7 +755,7 @@ FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state_in, XXH_endiane
 }
 
 
-U32 XXH32_digest (const XXH32_state_t* state_in)
+unsigned int XXH32_digest (const XXH32_state_t* state_in)
 {
     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
 
diff --git a/visual/2012/fuzzer/fuzzer.vcxproj b/visual/2012/fuzzer/fuzzer.vcxproj
index 8a114aef..f80b3b85 100644
--- a/visual/2012/fuzzer/fuzzer.vcxproj
+++ b/visual/2012/fuzzer/fuzzer.vcxproj
@@ -161,6 +161,7 @@
   <ItemGroup>
     <ClCompile Include="..\..\..\lib\fse.c" />
     <ClCompile Include="..\..\..\lib\zstd.c" />
+    <ClCompile Include="..\..\..\programs\datagen.c" />
     <ClCompile Include="..\..\..\programs\fuzzer.c" />
     <ClCompile Include="..\..\..\programs\xxhash.c" />
   </ItemGroup>
@@ -169,6 +170,7 @@
     <ClInclude Include="..\..\..\lib\fse_static.h" />
     <ClInclude Include="..\..\..\lib\zstd.h" />
     <ClInclude Include="..\..\..\lib\zstd_static.h" />
+    <ClInclude Include="..\..\..\programs\datagen.h" />
     <ClInclude Include="..\..\..\programs\xxhash.h" />
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
diff --git a/visual/2012/fuzzer/fuzzer.vcxproj.filters b/visual/2012/fuzzer/fuzzer.vcxproj.filters
index 7782b08f..6bc481a6 100644
--- a/visual/2012/fuzzer/fuzzer.vcxproj.filters
+++ b/visual/2012/fuzzer/fuzzer.vcxproj.filters
@@ -27,6 +27,9 @@
     <ClCompile Include="..\..\..\programs\xxhash.c">
       <Filter>Fichiers sources</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\..\programs\datagen.c">
+      <Filter>Fichiers sources</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\..\lib\fse.h">
@@ -44,5 +47,8 @@
     <ClInclude Include="..\..\..\programs\xxhash.h">
       <Filter>Fichiers d%27en-tête</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\..\programs\datagen.h">
+      <Filter>Fichiers d%27en-tête</Filter>
+    </ClInclude>
   </ItemGroup>
 </Project>
\ No newline at end of file