From 9700f925837d819896ee02bd7362b718e6f13916 Mon Sep 17 00:00:00 2001
From: Sean Purcell <me@seanp.xyz>
Date: Mon, 30 Jan 2017 11:42:45 -0800
Subject: [PATCH 01/15] Add educational decoder to /contrib

---
 contrib/educational_decoder/README.md         |   18 +
 contrib/educational_decoder/harness.c         |   93 +
 contrib/educational_decoder/zstd_decompress.c | 2096 +++++++++++++++++
 contrib/educational_decoder/zstd_decompress.h |    6 +
 4 files changed, 2213 insertions(+)
 create mode 100644 contrib/educational_decoder/README.md
 create mode 100644 contrib/educational_decoder/harness.c
 create mode 100644 contrib/educational_decoder/zstd_decompress.c
 create mode 100644 contrib/educational_decoder/zstd_decompress.h
diff --git a/contrib/educational_decoder/README.md b/contrib/educational_decoder/README.md
new file mode 100644
index 00000000..a1f703f6
--- /dev/null
+++ b/contrib/educational_decoder/README.md
@@ -0,0 +1,18 @@
+Educational Decoder
+===================
+
+`zstd_decompress.c` is a self-contained implementation of a decoder according
+to the Zstandard format specification written in C99.
+While it does not implement as many features as the reference decoder,
+such as the streaming API or content checksums, it is written to be easy to
+follow and understand, to help understand how the Zstandard format works.
+It's laid out to match the [format specification],
+so it can be used to understand how confusing segments could be implemented.
+It also contains implementations of Huffman and FSE table decoding.
+
+[format specification]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md
+
+`harness.c` provides a simple test harness around the decoder:
+
+    harness <input-file> <output-file> [dictionary]
+
diff --git a/contrib/educational_decoder/harness.c b/contrib/educational_decoder/harness.c
new file mode 100644
index 00000000..6f4765d9
--- /dev/null
+++ b/contrib/educational_decoder/harness.c
@@ -0,0 +1,93 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "zstd_decompress.h"
+
+typedef unsigned char u8;
+
+// There's no good way to determine output size without decompressing
+// For this example assume we'll never decompress at a ratio larger than 16
+#define MAX_COMPRESSION_RATIO (16)
+
+u8 *input;
+u8 *output;
+u8 *dict;
+
+// Reads all of `path` into a malloc'ed buffer stored in `*ptr` (the caller
+// frees it) and returns the byte count.  Exits with a message on failure.
+size_t read_file(const char *path, u8 **ptr) {
+  FILE *f = fopen(path, "rb");
+  if (!f) {
+    fprintf(stderr, "failed to open file %s\n", path);
+    exit(1);
+  }
+
+  // ftell reports failure as -1; treat that as size 0 and report it below
+  long end = fseek(f, 0L, SEEK_END) == 0 ? ftell(f) : -1L;
+  size_t size = end < 0 ? 0 : (size_t)end;
+  rewind(f);
+  // Test the buffer, not `ptr`; malloc(0) may return NULL, so ask for >= 1
+  *ptr = malloc(size ? size : 1);
+  if (!*ptr) {
+    fprintf(stderr, "failed to allocate memory to hold %s\n", path);
+    exit(1);
+  }
+  // Only ever request the space still left in the buffer
+  size_t pos = 0, got;
+  while (pos < size && (got = fread(&(*ptr)[pos], 1, size - pos, f)) > 0) {
+    pos += got;
+  }
+  if (end < 0 || ferror(f)) {
+    fprintf(stderr, "error while reading file %s\n", path);
+    exit(1);
+  }
+  fclose(f);
+  return pos;
+}
+
+// Writes `size` bytes from `ptr` to `path`; exits with a message on failure.
+void write_file(const char *path, const u8 *ptr, size_t size) {
+  FILE *f = fopen(path, "wb");
+  size_t written = 0;
+  while (f && written < size && !ferror(f)) {
+    // Only the unwritten tail: asking for `size` again would over-read `ptr`
+    written += fwrite(&ptr[written], 1, size - written, f);
+  }
+  // A failed fopen lands here too; fclose flushes, so its result counts
+  if (!f || ferror(f) || fclose(f) != 0) {
+    fprintf(stderr, "error while writing file %s\n", path);
+    exit(1);
+  }
+}
+
+// Usage: harness <file.zst> <out_path> [dictionary]
+// Decompresses the input file, optionally with a dictionary, into out_path.
+int main(int argc, char **argv) {
+  if (argc < 3) {
+    fprintf(stderr, "usage: %s <file.zst> <out_path> [dictionary]\n", argv[0]);
+    return 1;
+  }
+
+  const size_t input_size = read_file(argv[1], &input);
+  // The dictionary argument is optional; without it `dict` stays NULL
+  const size_t dict_size = (argc >= 4) ? read_file(argv[3], &dict) : 0;
+
+  // The output size is a guess based on the assumed maximum ratio
+  const size_t output_capacity = MAX_COMPRESSION_RATIO * input_size;
+  output = malloc(output_capacity);
+  if (output == NULL) {
+    fprintf(stderr, "failed to allocate memory\n");
+    return 1;
+  }
+
+  const size_t decompressed = ZSTD_decompress_with_dict(
+      output, output_capacity, input, input_size, dict, dict_size);
+  write_file(argv[2], output, decompressed);
+
+  // Release every buffer and clear the globals that pointed at them
+  free(dict);
+  free(output);
+  free(input);
+  dict = output = input = NULL;
+}
+
diff --git a/contrib/educational_decoder/zstd_decompress.c b/contrib/educational_decoder/zstd_decompress.c
new file mode 100644
index 00000000..8dc15900
--- /dev/null
+++ b/contrib/educational_decoder/zstd_decompress.c
@@ -0,0 +1,2096 @@
+/// Zstandard educational decoder implementation
+/// See https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/// Zstandard decompression functions.
+/// `dst` must point to a space at least as large as the reconstructed output.
+size_t ZSTD_decompress(void *dst, size_t dst_len, const void *src,
+                       size_t src_len);
+/// If `dict != NULL` and `dict_len >= 8`, does the same thing as
+/// `ZSTD_decompress` but uses the provided dict
+size_t ZSTD_decompress_with_dict(void *dst, size_t dst_len, const void *src,
+                                 size_t src_len, const void *dict,
+                                 size_t dict_len);
+
+/******* UTILITY MACROS AND TYPES *********************************************/
+#define MAX_WINDOW_SIZE ((size_t)512 << 20)
+// The maximum decompressed size of a block is 128 KB, and the literals of a
+// block cannot be larger than that
+#define MAX_LITERALS_SIZE ((size_t)(1024 * 128))
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+#define ERROR(s)                                                               \
+    do {                                                                       \
+        fprintf(stderr, "Error: %s\n", s);                                     \
+        exit(1);                                                               \
+    } while (0)
+#define INP_SIZE()                                                             \
+    ERROR("Input buffer smaller than it should be or input is "                \
+          "corrupted")
+#define OUT_SIZE() ERROR("Output buffer too small for output")
+#define CORRUPTION() ERROR("Corruption detected while decompressing")
+#define BAD_ALLOC() ERROR("Memory allocation error")
+
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+typedef int8_t i8;
+typedef int16_t i16;
+typedef int32_t i32;
+typedef int64_t i64;
+/******* END UTILITY MACROS AND TYPES *****************************************/
+
+/******* IMPLEMENTATION PRIMITIVE PROTOTYPES **********************************/
+/// The implementations for these functions can be found at the bottom of this
+/// file.  They implement low-level functionality needed for the higher level
+/// decompression functions.
+
+/*** CIRCULAR BUFFER ******************/
+/// A standard circular buffer, used to facilitate back reference commands
+typedef struct {
+    u8 *ptr;
+    size_t idx, last_flush, size;
+} cbuf_t;
+
+/// Initialize a circular buffer
+static void cbuf_init(cbuf_t *buf, size_t size);
+static void cbuf_free(cbuf_t *buf);
+
+/// Copies up to `src_len` bytes from `src` into the buffer, stopping if it
+/// would need to flush.
+/// Returns the total amount of data copied.
+static size_t cbuf_write_data(cbuf_t *buf, const u8 *src, size_t src_len);
+/// Copies `len` bytes from `offset` back in the buffer, stopping if it would
+/// need to flush.
+/// Returns the number of bytes copied.
+static size_t cbuf_copy_offset(cbuf_t *buf, size_t offset, size_t len);
+/// Writes up to `len` copies of `byte`, stopping if it would need to flush.
+/// Returns the number of bytes copied.
+static size_t cbuf_repeat_byte(cbuf_t *buf, u8 byte, size_t len);
+
+/// The `full` versions of the above functions write the full amount requested,
+/// flushing to `out` when necessary.
+/// They return the number of bytes flushed to `out`, if any.
+static size_t cbuf_write_data_full(cbuf_t *buf, const u8 *src, size_t src_len,
+                                   u8 *out, size_t out_len);
+static size_t cbuf_copy_offset_full(cbuf_t *buf, size_t offset, size_t len,
+                                    u8 *out, size_t out_len);
+static size_t cbuf_repeat_byte_full(cbuf_t *buf, u8 byte, size_t len, u8 *out,
+                                    size_t out_len);
+
+/// Flushes any unflushed data to `dst`
+static size_t cbuf_flush(cbuf_t *buf, u8 *dst, size_t dst_len);
+/*** END CIRCULAR BUFFER **************/
+
+/*** BITSTREAM OPERATIONS *************/
+/// Read `num` bits (up to 64) from `src + offset`, where `offset` is in bits
+static inline u64 read_bits_LE(const u8 *src, int num, size_t offset);
+
+/// Read bits from the end of a HUF or FSE bitstream.  `offset` is in bits, so
+/// it updates `offset` to `offset - bits`, and then reads `bits` bits from
+/// `src + offset`.  If the offset becomes negative, the extra bits at the
+/// bottom are filled in with `0` bits instead of reading from before `src`.
+static inline u64 STREAM_read_bits(const u8 *src, int bits, i64 *offset);
+/*** END BITSTREAM OPERATIONS *********/
+
+/*** BIT COUNTING OPERATIONS **********/
+/// Returns `x`, where `2^x` is the smallest power of 2 greater than or equal to
+/// `num`, or `-1` if `num > 2^63`
+static inline int log2sup(u64 num);
+
+/// Returns `x`, where `2^x` is the largest power of 2 less than or equal to
+/// `num`, or `-1` if `num == 0`.
+static inline int log2inf(u64 num);
+/*** END BIT COUNTING OPERATIONS ******/
+
+/*** HUFFMAN PRIMITIVES ***************/
+// Table decode method uses exponential memory, so we need to limit depth
+#define HUF_MAX_BITS (16)
+
+// Limit the maximum number of symbols to 256 so we can store a symbol in a byte
+#define HUF_MAX_SYMBS (256)
+
+/// Structure containing all tables necessary for efficient Huffman decoding
+typedef struct {
+    u8 *symbols;
+    u8 *num_bits;
+    int max_bits;
+} HUF_dtable;
+
+/// Decode a single symbol and read in enough bits to refresh the state
+static inline u8 HUF_decode_symbol(HUF_dtable *dtable, u16 *state,
+                                   const u8 *src, i64 *offset);
+/// Read in a full state's worth of bits to initialize it
+static inline void HUF_init_state(HUF_dtable *dtable, u16 *state, const u8 *src,
+                                  i64 *offset);
+
+/// Initialize a Huffman decoding table using the table of bit counts provided
+static void HUF_init_dtable(HUF_dtable *table, u8 *bits, int num_symbs);
+/// Initialize a Huffman decoding table using the table of weights provided
+/// Weights follow the definition provided in the Zstandard specification
+static void HUF_init_dtable_usingweights(HUF_dtable *table, u8 *weights,
+                                         int num_symbs);
+
+/// Decompresses a single Huffman stream, returns the number of bytes decoded.
+/// `src_len` must be the exact length of the Huffman-coded block.
+static size_t HUF_decompress_1stream(HUF_dtable *table, u8 *dst, size_t dst_len,
+                                     const u8 *src, size_t src_len);
+/// Same as previous but decodes 4 streams, formatted as in the Zstandard
+/// specification.
+/// `src_len` must be the exact length of the Huffman-coded block.
+static size_t HUF_decompress_4stream(HUF_dtable *dtable, u8 *dst,
+                                     size_t dst_len, const u8 *src,
+                                     size_t src_len);
+
+/// Free the malloc'ed parts of a decoding table
+static void HUF_free_dtable(HUF_dtable *dtable);
+
+/// Deep copy a decoding table, so that it can be used and free'd without
+/// impacting the source table.
+static void HUF_copy_dtable(HUF_dtable *dst, const HUF_dtable *src);
+/*** END HUFFMAN PRIMITIVES ***********/
+
+/*** FSE PRIMITIVES *******************/
+/// For more description of FSE see
+/// https://github.com/Cyan4973/FiniteStateEntropy/
+
+// FSE table decoding uses exponential memory, so limit the maximum accuracy
+#define FSE_MAX_ACCURACY_LOG (15)
+// Limit the maximum number of symbols so they can be stored in a single byte
+#define FSE_MAX_SYMBS (256)
+
+/// The tables needed to decode FSE encoded streams
+typedef struct {
+    u8 *symbols;
+    u8 *num_bits;
+    u16 *new_state_base;
+    int accuracy_log;
+} FSE_dtable;
+
+/// Return the symbol for the current state
+static inline u8 FSE_peek_symbol(FSE_dtable *dtable, u16 state);
+/// Read the number of bits necessary to update state, update, and shift offset
+/// back to reflect the bits read
+static inline void FSE_update_state(FSE_dtable *dtable, u16 *state,
+                                    const u8 *src, i64 *offset);
+
+/// Combine peek and update: decode a symbol and update the state
+static inline u8 FSE_decode_symbol(FSE_dtable *dtable, u16 *state,
+                                   const u8 *src, i64 *offset);
+
+/// Read bits from the stream to initialize the state and shift offset back
+static inline void FSE_init_state(FSE_dtable *dtable, u16 *state, const u8 *src,
+                                  i64 *offset);
+
+/// Decompress two interleaved bitstreams (e.g. compressed Huffman weights)
+/// using an FSE decoding table.  `src_len` must be the exact length of the
+/// block.
+static size_t FSE_decompress_interleaved2(FSE_dtable *dtable, u8 *dst,
+                                          size_t dst_len, const u8 *src,
+                                          size_t src_len);
+
+/// Initialize a decoding table using normalized frequencies.
+static void FSE_init_dtable(FSE_dtable *dtable, const i16 *norm_freqs,
+                            int num_symbs, int accuracy_log);
+
+/// Decode an FSE header as defined in the Zstandard format specification and
+/// use the decoded frequencies to initialize a decoding table.
+static size_t FSE_decode_header(FSE_dtable *dtable, const u8 *src,
+                                size_t src_len, int max_accuracy_log);
+
+/// Initialize an FSE table that will always return the same symbol and consume
+/// 0 bits per symbol, to be used for RLE mode in sequence commands
+static void FSE_init_dtable_rle(FSE_dtable *dtable, u8 symb);
+
+/// Free the malloc'ed parts of a decoding table
+static void FSE_free_dtable(FSE_dtable *dtable);
+
+/// Deep copy a decoding table, so that it can be used and free'd without
+/// impacting the source table.
+static void FSE_copy_dtable(FSE_dtable *dst, const FSE_dtable *src);
+/*** END FSE PRIMITIVES ***************/
+
+/******* END IMPLEMENTATION PRIMITIVE PROTOTYPES ******************************/
+
+/******* ZSTD HELPER STRUCTS AND PROTOTYPES ***********************************/
+
+/// Input and output pointers to allow them to be advanced by
+/// functions that consume input/produce output
+typedef struct {
+    u8 *dst;
+    size_t dst_len;
+
+    const u8 *src;
+    size_t src_len;
+} io_streams_t;
+
+/// The context needed to decode blocks in a frame
+typedef struct {
+    size_t window_size;
+    size_t frame_content_size;
+
+    // The total amount of data available for backreferences, to determine if an
+    // offset is too large to be correct
+    size_t current_total_output;
+
+    // A sliding window of the past `window_size` bytes decoded
+    cbuf_t window;
+
+    // Entropy encoding tables, kept so that future blocks can repeat them
+    // instead of retransmitting them (a dictionary can also provide the
+    // initial tables)
+    HUF_dtable literals_dtable;
+    FSE_dtable ll_dtable;
+    FSE_dtable ml_dtable;
+    FSE_dtable of_dtable;
+
+    // The last 3 offsets for the special "repeat offsets".  Array size is 4 so
+    // that previous_offsets[1] corresponds to the most recent offset
+    u64 previous_offsets[4];
+
+    // The dictionary id for this frame if one exists
+    u32 dictionary_id;
+
+    int single_segment_flag;   // copied from the frame header descriptor
+    int content_checksum_flag; // copied from the frame header descriptor
+} frame_context_t;
+
+/// The decoded contents of a dictionary so that it doesn't have to be repeated
+/// for each frame that uses it
+typedef struct {
+    // Entropy tables
+    HUF_dtable literals_dtable;
+    FSE_dtable ll_dtable;
+    FSE_dtable ml_dtable;
+    FSE_dtable of_dtable;
+
+    // Raw content for backreferences
+    u8 *content;
+    size_t content_size;
+
+    // Offset history to prepopulate the frame's history
+    u64 previous_offsets[4];
+
+    u32 dictionary_id;
+} dictionary_t;
+
+/// A tuple containing the parts necessary to decode and execute a ZSTD sequence
+/// command
+typedef struct {
+    u32 literal_length;
+    u32 match_length;
+    u32 offset;
+} sequence_command_t;
+
+/// The decoder works top-down, starting at the high level like Zstd frames, and
+/// working down to lower more technical levels such as blocks, literals, and
+/// sequences.  The high-level functions roughly follow the outline of the
+/// format specification:
+/// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md
+
+/// Before the implementation of each high-level function declared here, the
+/// prototypes for their helper functions are defined and explained
+
+/// Decode a single Zstd frame, or error if the input is not a valid frame.
+/// Accepts a dict argument, which may be NULL indicating no dictionary.
+/// See
+/// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame-concatenation
+static void decode_frame(io_streams_t *streams, dictionary_t *dict);
+
+// Decode data in a compressed block
+static void decompress_block(io_streams_t *streams, frame_context_t *ctx,
+                             size_t block_len);
+
+// Decode the literals section of a block
+static size_t decode_literals(io_streams_t *streams, frame_context_t *ctx,
+                              u8 **literals);
+
+// Decode the sequences part of a block
+static size_t decode_sequences(frame_context_t *ctx, const u8 *src,
+                               size_t src_len, sequence_command_t **sequences);
+
+// Execute the decoded sequences on the literals block
+static size_t execute_sequences(io_streams_t *streams, frame_context_t *ctx,
+                                sequence_command_t *sequences,
+                                size_t num_sequences, const u8 *literals,
+                                size_t literals_len);
+
+// Parse a provided dictionary blob for use in decompression
+static void parse_dictionary(dictionary_t *dict, const u8 *src, size_t src_len);
+static void free_dictionary(dictionary_t *dict);
+/******* END ZSTD HELPER STRUCTS AND PROTOTYPES *******************************/
+/// Decompress without a dictionary by passing a NULL, zero-length one along
+size_t ZSTD_decompress(void *dst, size_t dst_len, const void *src,
+                       size_t src_len) {
+    return ZSTD_decompress_with_dict(dst, dst_len, src, src_len, NULL, 0);
+}
+
+size_t ZSTD_decompress_usingDict(void *_ctx, void *dst, size_t dst_len,
+                                 const void *src, size_t src_len,
+                                 const void *dict, size_t dict_len) {
+    // _ctx is unused; it is only here to match the ZSTD lib signature
+    return ZSTD_decompress_with_dict(dst, dst_len, src, src_len, dict,
+                                     dict_len);
+}
+
+size_t ZSTD_decompress_with_dict(void *dst, size_t dst_len, const void *src,
+                                 size_t src_len, const void *dict,
+                                 size_t dict_len) {
+    dictionary_t parsed_dict;
+    memset(&parsed_dict, 0, sizeof(dictionary_t));
+    // Anything shorter than 8 bytes is not a valid dictionary, so exactly 8
+    // bytes must be accepted, as the comment on the prototype states
+    if (dict && dict_len >= 8) {
+        parse_dictionary(&parsed_dict, (const u8 *)dict, dict_len);
+    }
+
+    // The input may hold several frames back to back; decode until it is empty
+    io_streams_t streams = {(u8 *)dst, dst_len, (const u8 *)src, src_len};
+    while (streams.src_len > 0) {
+        decode_frame(&streams, &parsed_dict);
+    }
+    free_dictionary(&parsed_dict);
+    return streams.dst - (u8 *)dst; // bytes written to `dst`
+}
+
+/******* FRAME DECODING ******************************************************/
+
+static void decode_data_frame(io_streams_t *streams, dictionary_t *dict);
+static void init_frame_context(frame_context_t *context);
+static void free_frame_context(frame_context_t *context);
+static void parse_frame_header(io_streams_t *streams, frame_context_t *ctx,
+                               dictionary_t *dict);
+static void frame_context_apply_dict(frame_context_t *ctx, dictionary_t *dict);
+
+static void decompress_data(io_streams_t *streams, frame_context_t *ctx);
+
+static void decode_frame(io_streams_t *streams, dictionary_t *dict) {
+    if (streams->src_len < 4) {
+        INP_SIZE();
+    }
+    u32 magic_number = read_bits_LE(streams->src, 32, 0);
+
+    streams->src += 4;
+    streams->src_len -= 4;
+    if (magic_number >= 0x184D2A50U && magic_number <= 0x184D2A5F) {
+        // skippable frame
+        if (streams->src_len < 4) {
+            INP_SIZE();
+        }
+        // `src` already points just past the magic number, so the 4-byte
+        // frame size is at bit offset 0; offset 32 would read the payload
+        size_t frame_size = read_bits_LE(streams->src, 32, 0);
+        // Compare against what is left after the size field (cannot wrap)
+        if (streams->src_len - 4 < frame_size) {
+            INP_SIZE();
+        }
+        // skip over frame
+        streams->src += 4 + frame_size;
+        streams->src_len -= 4 + frame_size;
+    } else if (magic_number == 0xFD2FB528U) {
+        // ZSTD frame
+        decode_data_frame(streams, dict);
+    } else {
+        ERROR("Invalid magic number");
+    }
+}
+
+/// Decodes one Zstandard data frame (as opposed to a skippable frame, which
+/// carries no compressed data), starting right after its magic number.
+/// See
+/// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#general-structure-of-zstandard-frame-format
+static void decode_data_frame(io_streams_t *streams, dictionary_t *dict) {
+    // Per-frame state that has to survive from one block to the next
+    frame_context_t ctx;
+    init_frame_context(&ctx);
+
+    // The header sizes the window; the dictionary then pre-fills the context
+    parse_frame_header(streams, &ctx, dict);
+    frame_context_apply_dict(&ctx, dict);
+
+    // A header-declared content size that cannot fit in `dst` is fatal
+    const size_t declared_size = ctx.frame_content_size;
+    if (declared_size > streams->dst_len && declared_size != 0) {
+        OUT_SIZE();
+    }
+    decompress_data(streams, &ctx);
+    free_frame_context(&ctx);
+}
+
+static void init_frame_context(frame_context_t *context) {
+    // Zero everything, then seed the repeat-offset history with its starting
+    // values (index 0 is unused so that index 1 is the most recent offset)
+    static const u64 initial_offsets[4] = {0, 1, 4, 8};
+    memset(context, 0, sizeof(*context));
+    memcpy(context->previous_offsets, initial_offsets,
+           sizeof(context->previous_offsets));
+}
+
+static void free_frame_context(frame_context_t *context) {
+    FSE_dtable *const fse_tables[] = {&context->ll_dtable, &context->ml_dtable,
+                                      &context->of_dtable};
+    // Release the entropy tables, then the sliding window
+    HUF_free_dtable(&context->literals_dtable);
+    for (size_t i = 0; i < sizeof(fse_tables) / sizeof(fse_tables[0]); i++) {
+        FSE_free_dtable(fse_tables[i]);
+    }
+    cbuf_free(&context->window);
+    memset(context, 0, sizeof(*context)); // clear the now-stale pointers
+}
+
+/// Parse the frame header at `streams->src` into `ctx`, advance past it, and
+/// allocate the window.  Errors out on a set reserved bit, truncated input,
+/// or a window size above MAX_WINDOW_SIZE.
+static void parse_frame_header(io_streams_t *streams, frame_context_t *ctx,
+                               dictionary_t *dict) {
+    if (streams->src_len < 1) {
+        INP_SIZE();
+    }
+
+    u8 descriptor = read_bits_LE(streams->src, 8, 0);
+
+    // decode frame header descriptor into flags
+    u8 frame_content_size_flag = descriptor >> 6;
+    u8 single_segment_flag = (descriptor >> 5) & 1;
+    u8 reserved_bit = (descriptor >> 3) & 1;
+    u8 content_checksum_flag = (descriptor >> 2) & 1;
+    u8 dictionary_id_flag = descriptor & 3;
+
+    if (reserved_bit != 0) {
+        CORRUPTION();
+    }
+
+    streams->src++;
+    streams->src_len--;
+
+    ctx->single_segment_flag = single_segment_flag;
+    ctx->content_checksum_flag = content_checksum_flag;
+
+    // decode window size
+    if (!single_segment_flag) {
+        if (streams->src_len < 1) {
+            INP_SIZE();
+        }
+
+        // Use the algorithm from the specification to compute window size
+        // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor
+        u8 window_descriptor = read_bits_LE(streams->src, 8, 0);
+        u8 exponent = window_descriptor >> 3;
+        u8 mantissa = window_descriptor & 7;
+
+        size_t window_base = (size_t)1 << (10 + exponent);
+        size_t window_add = (window_base / 8) * mantissa;
+        ctx->window_size = window_base + window_add;
+
+        streams->src++;
+        streams->src_len--;
+    }
+
+    // decode dictionary id if it exists
+    if (dictionary_id_flag) {
+        const int bytes_array[] = {0, 1, 2, 4};
+        const int bytes = bytes_array[dictionary_id_flag];
+
+        if (streams->src_len < bytes) {
+            INP_SIZE();
+        }
+
+        ctx->dictionary_id = read_bits_LE(streams->src, bytes * 8, 0);
+        streams->src += bytes;
+        streams->src_len -= bytes;
+    } else {
+        ctx->dictionary_id = 0;
+    }
+
+    // decode frame content size if it exists
+    if (single_segment_flag || frame_content_size_flag) {
+        // if frame_content_size_flag == 0 but single_segment_flag is set, we
+        // still have a 1 byte field
+        const int bytes_array[] = {1, 2, 4, 8};
+        const int bytes = bytes_array[frame_content_size_flag];
+
+        if (streams->src_len < bytes) {
+            INP_SIZE();
+        }
+
+        ctx->frame_content_size = read_bits_LE(streams->src, bytes * 8, 0);
+        if (bytes == 2) {
+            ctx->frame_content_size += 256;
+        }
+
+        streams->src += bytes;
+        streams->src_len -= bytes;
+    }
+
+    if (single_segment_flag) {
+        // No window descriptor is present in this case: the window has to
+        // hold the entire frame content plus any dictionary content
+        ctx->window_size =
+            ctx->frame_content_size + (dict ? dict->content_size : 0);
+    }
+
+    // Allocate the window
+    if (ctx->window_size > MAX_WINDOW_SIZE) {
+        ERROR("Requested window size too large");
+    }
+    cbuf_init(&ctx->window, ctx->window_size);
+}
+
+/// A dictionary supplies initial values for the frame context, so it is
+/// implemented by loading its content and precomputed tables into the
+/// context before decompression begins
+static void frame_context_apply_dict(frame_context_t *ctx, dictionary_t *dict) {
+    // A NULL content pointer marks an empty dictionary: nothing to apply
+    if (dict == NULL || dict->content == NULL) {
+        return;
+    }
+    const u32 wanted_id = ctx->dictionary_id;
+    const u32 given_id = dict->dictionary_id;
+
+    // The frame asks for no dictionary, so a formatted one is left out; using
+    // it could disturb the default offset history
+    if (wanted_id == 0 && given_id != 0) {
+        return;
+    }
+
+    // A frame that names a dictionary id must be given exactly that dictionary
+    if (wanted_id != 0 && wanted_id != given_id) {
+        ERROR("Wrong/no dictionary provided");
+    }
+
+    // Load the dictionary content into the window, then flush with a NULL
+    // destination so none of it is sent to the output stream
+    cbuf_write_data_full(&ctx->window, dict->content, dict->content_size, NULL,
+                         -1);
+    cbuf_flush(&ctx->window, NULL, -1);
+    ctx->current_total_output = dict->content_size;
+
+    // Only a formatted dict (non-zero id) has tables and offsets to copy
+    if (given_id == 0) {
+        return;
+    }
+    // Deep copy the entropy tables so the context can free its own copies
+    // without touching the dictionary struct; they feed the repeat modes
+    HUF_copy_dtable(&ctx->literals_dtable, &dict->literals_dtable);
+    FSE_copy_dtable(&ctx->ll_dtable, &dict->ll_dtable);
+    FSE_copy_dtable(&ctx->of_dtable, &dict->of_dtable);
+    FSE_copy_dtable(&ctx->ml_dtable, &dict->ml_dtable);
+    memcpy(ctx->previous_offsets, dict->previous_offsets,
+           sizeof(ctx->previous_offsets));
+}
+
+/// Decompress the data from a frame block by block
+static void decompress_data(io_streams_t *streams, frame_context_t *ctx) {
+    u8 last_block = 0;
+    do {
+        if (streams->src_len < 3) {
+            INP_SIZE();
+        }
+        // Parse the block header
+        last_block = streams->src[0] & 1;
+        u8 block_type = (streams->src[0] >> 1) & 3;
+        size_t block_len = read_bits_LE(streams->src, 21, 3);
+
+        streams->src += 3;
+        streams->src_len -= 3;
+
+        switch (block_type) {
+        case 0: {
+            // Raw, uncompressed block
+            if (streams->src_len < block_len) {
+                INP_SIZE();
+            }
+            if (streams->dst_len < block_len) {
+                OUT_SIZE();
+            }
+
+            // Write the raw data into the window buffer
+            size_t written =
+                cbuf_write_data_full(&ctx->window, streams->src, block_len,
+                                     streams->dst, streams->dst_len);
+            streams->src += block_len;
+            streams->src_len -= block_len;
+
+            streams->dst += written;
+            streams->dst_len -= written;
+            break;
+        }
+        case 1: {
+            // RLE block, repeat the first byte N times
+            if (streams->src_len < 1) {
+                INP_SIZE();
+            }
+            if (streams->dst_len < block_len) {
+                OUT_SIZE();
+            }
+
+            // Write streams->src[0] into the buffer block_len times
+            size_t written =
+                cbuf_repeat_byte_full(&ctx->window, streams->src[0], block_len,
+                                      streams->dst, streams->dst_len);
+            streams->dst += written;
+            streams->dst_len -= written;
+
+            streams->src += 1;
+            streams->src_len -= 1;
+            break;
+        }
+        case 2:
+            // Compressed block, this is more complex
+            decompress_block(streams, ctx, block_len);
+            break;
+        default: // Block type 3 is reserved, so the input must be corrupt
+            CORRUPTION();
+        }
+    } while (!last_block);
+
+    // Flush out anything left in the window buffer to the destination stream
+    size_t written = cbuf_flush(&ctx->window, streams->dst, streams->dst_len);
+    streams->dst += written;
+    streams->dst_len -= written;
+
+    if (ctx->content_checksum_flag) {
+        // This program does not support checking the checksum, so skip over it
+        // if it's present
+        if (streams->src_len < 4) {
+            INP_SIZE();
+        }
+        streams->src += 4;
+        streams->src_len -= 4;
+    }
+}
+/******* END FRAME DECODING ***************************************************/
+
+/******* BLOCK DECOMPRESSION **************************************************/
+static void decompress_block(io_streams_t *streams, frame_context_t *ctx,
+                             size_t block_len) {
+    if (block_len > streams->src_len) {
+        INP_SIZE();
+    }
+    // Remember where the block stops: whatever the literals section leaves
+    // unread before this point belongs to the sequences section
+    const u8 *const block_end = streams->src + block_len;
+
+    // Step 1: the literals section, which advances `streams->src` itself
+    u8 *literal_bytes = NULL;
+    const size_t num_literals = decode_literals(streams, ctx, &literal_bytes);
+    if (block_end < streams->src) {
+        INP_SIZE();
+    }
+
+    // Step 2: the sequences section is the remainder of the block
+    const size_t remaining = block_end - streams->src;
+    sequence_command_t *commands = NULL;
+    const size_t num_commands =
+        decode_sequences(ctx, streams->src, remaining, &commands);
+    streams->src += remaining;
+    streams->src_len -= remaining;
+
+    // Step 3: run the commands against the literals to produce the output
+    execute_sequences(streams, ctx, commands, num_commands, literal_bytes,
+                      num_literals);
+    free(commands);
+    free(literal_bytes);
+}
+/******* END BLOCK DECOMPRESSION **********************************************/
+
+/******* LITERALS DECODING ****************************************************/
+static size_t decode_literals_simple(io_streams_t *streams, u8 **literals,
+                                     int block_type, int size_format);
+static size_t decode_literals_compressed(io_streams_t *streams,
+                                         frame_context_t *ctx, u8 **literals,
+                                         int block_type, int size_format);
+static size_t decode_huf_table(const u8 *src, size_t src_len,
+                               HUF_dtable *dtable);
+static size_t fse_decode_hufweights(const u8 *src, size_t src_len, u8 *weights,
+                                    int *num_symbs, size_t compressed_size);
+
+/// Read the first byte of the literals section and dispatch on its block type.
+/// Stores the decoded literals buffer in `*literals` and returns the value
+/// returned by the selected decoder (the number of literal bytes).
+static size_t decode_literals(io_streams_t *streams, frame_context_t *ctx,
+                              u8 **literals) {
+    if (streams->src_len < 1) {
+        INP_SIZE();
+    }
+    // Bits 0-1 of the header byte: block type, bits 2-3: size format
+    const int header = streams->src[0];
+    const int block_type = header & 3;
+    const int size_format = (header >> 2) & 3;
+
+    if (block_type > 1) {
+        // Huffman compressed literals
+        return decode_literals_compressed(streams, ctx, literals, block_type,
+                                          size_format);
+    }
+    // Raw or RLE literals block
+    return decode_literals_simple(streams, literals, block_type, size_format);
+}
+
+/// Decodes literals blocks in raw or RLE form
+///
+/// `block_type` is 0 for raw literals (copied verbatim from the input) or 1
+/// for RLE literals (a single byte repeated).  `size_format` selects how many
+/// header bytes hold the size field.  On return `streams` has been advanced
+/// past the header and the payload, `*literals` points to a malloc'ed buffer
+/// and the number of literal bytes is returned.
+static size_t decode_literals_simple(io_streams_t *streams, u8 **literals,
+                                     int block_type, int size_format) {
+    size_t size;
+    switch (size_format) {
+    // These cases are in the form X0
+    // In this case, the X bit is actually part of the size field
+    case 0:
+    case 2:
+        // 1 header byte: 5-bit size starting at bit 3
+        if (streams->src_len < 1) {
+            INP_SIZE();
+        }
+        size = read_bits_LE(streams->src, 5, 3);
+        streams->src += 1;
+        streams->src_len -= 1;
+        break;
+    case 1:
+        // 2 header bytes: 12-bit size starting at bit 4
+        if (streams->src_len < 2) {
+            INP_SIZE();
+        }
+        size = read_bits_LE(streams->src, 12, 4);
+        streams->src += 2;
+        streams->src_len -= 2;
+        break;
+    case 3:
+        // 3 header bytes: 20-bit size starting at bit 4.  All three bytes are
+        // read and consumed, so all three must be present.
+        if (streams->src_len < 3) {
+            INP_SIZE();
+        }
+        size = read_bits_LE(streams->src, 20, 4);
+        streams->src += 3;
+        streams->src_len -= 3;
+        break;
+    default:
+        // Impossible
+        size = -1;
+    }
+
+    if (size > MAX_LITERALS_SIZE) {
+        CORRUPTION();
+    }
+
+    // malloc(0) is allowed to return NULL; request at least one byte so that
+    // an empty literals section is not mistaken for an allocation failure
+    *literals = malloc(size > 0 ? size : 1);
+    if (!*literals) {
+        BAD_ALLOC();
+    }
+
+    switch (block_type) {
+    case 0:
+        // Raw data
+        if (size > streams->src_len) {
+            INP_SIZE();
+        }
+        memcpy(*literals, streams->src, size);
+        streams->src += size;
+        streams->src_len -= size;
+        break;
+    case 1:
+        // Single repeated byte
+        if (1 > streams->src_len) {
+            INP_SIZE();
+        }
+        memset(*literals, streams->src[0], size);
+        streams->src += 1;
+        streams->src_len -= 1;
+        break;
+    }
+
+    return size;
+}
+
+/// Decodes Huffman compressed literals
+///
+/// When `block_type` is 2 a new Huffman table description precedes the
+/// compressed streams and replaces the table stored in `ctx`; otherwise the
+/// table already in `ctx` is reused.  Advances `streams` past the section,
+/// stores a malloc'ed buffer in `*literals` and returns the regenerated size.
+static size_t decode_literals_compressed(io_streams_t *streams,
+                                         frame_context_t *ctx, u8 **literals,
+                                         int block_type, int size_format) {
+    // Indexed by size_format: header length in bytes and the width of each of
+    // the two size fields, which sit back to back starting at bit 4
+    static const size_t header_bytes[4] = {3, 3, 4, 5};
+    static const int field_bits[4] = {10, 10, 14, 18};
+
+    size_t regenerated_size, compressed_size;
+    // Only size_format=0 has 1 stream, every other format has 4
+    const int num_streams = (size_format == 0) ? 1 : 4;
+
+    if (size_format >= 0 && size_format <= 3) {
+        const size_t header_len = header_bytes[size_format];
+        const int width = field_bits[size_format];
+
+        if (streams->src_len < header_len) {
+            INP_SIZE();
+        }
+        regenerated_size = read_bits_LE(streams->src, width, 4);
+        compressed_size = read_bits_LE(streams->src, width, 4 + width);
+        streams->src += header_len;
+        streams->src_len -= header_len;
+    } else {
+        // Impossible
+        compressed_size = regenerated_size = -1;
+    }
+
+    if (compressed_size > regenerated_size ||
+        regenerated_size > MAX_LITERALS_SIZE) {
+        CORRUPTION();
+    }
+
+    if (streams->src_len < compressed_size) {
+        INP_SIZE();
+    }
+
+    u8 *const output = malloc(regenerated_size);
+    *literals = output;
+    if (!output) {
+        BAD_ALLOC();
+    }
+
+    HUF_dtable *const dtable = &ctx->literals_dtable;
+    if (block_type != 2) {
+        // If we're to repeat the previous Huffman table, make sure it exists
+        if (!dtable->symbols) {
+            CORRUPTION();
+        }
+    } else {
+        // Replace the stored table with the one described in the input; the
+        // description counts towards compressed_size
+        HUF_free_dtable(dtable);
+        const size_t table_len =
+            decode_huf_table(streams->src, compressed_size, dtable);
+        streams->src += table_len;
+        streams->src_len -= table_len;
+        compressed_size -= table_len;
+    }
+
+    if (num_streams != 1) {
+        HUF_decompress_4stream(dtable, output, regenerated_size, streams->src,
+                               compressed_size);
+    } else {
+        HUF_decompress_1stream(dtable, output, regenerated_size, streams->src,
+                               compressed_size);
+    }
+    streams->src += compressed_size;
+    streams->src_len -= compressed_size;
+
+    return regenerated_size;
+}
+
+// Decode the Huffman table description found at `src` and build `dtable`
+// from it.  Returns the number of input bytes the description occupied.
+static size_t decode_huf_table(const u8 *src, size_t src_len,
+                               HUF_dtable *dtable) {
+    if (src_len < 1) {
+        INP_SIZE();
+    }
+
+    // The first byte selects the representation; the weights follow it
+    const u8 header = src[0];
+    const u8 *cursor = src + 1;
+    const size_t remaining = src_len - 1;
+
+    u8 weights[HUF_MAX_SYMBS];
+    memset(weights, 0, sizeof(weights));
+
+    int num_symbs;
+
+    if (header < 128) {
+        // The weights are FSE encoded and `header` is the size of that
+        // encoded segment; decode them before we can construct the table
+        cursor += fse_decode_hufweights(cursor, remaining, weights, &num_symbs,
+                                        header);
+    } else {
+        // Direct representation: two 4-bit weights per byte, high nibble
+        // first
+        num_symbs = header - 127;
+        const size_t packed_len = (num_symbs + 1) / 2;
+
+        if (packed_len > remaining) {
+            INP_SIZE();
+        }
+
+        for (int i = 0; i < num_symbs; i++) {
+            const u8 pair = cursor[i / 2];
+            weights[i] = (i % 2 == 0) ? (pair >> 4) : (pair & 0xf);
+        }
+
+        cursor += packed_len;
+    }
+
+    // Construct the table using the decoded weights
+    HUF_init_dtable_usingweights(dtable, weights, num_symbs);
+    return cursor - src;
+}
+
+/// Decode FSE-compressed Huffman weights.
+///
+/// `src`/`src_len` is the available input and `compressed_size` is the length
+/// of the FSE segment (table header plus bitstream) at its start.  Decoded
+/// weights are written to `weights` (room for HUF_MAX_SYMBS entries) and their
+/// count to `*num_symbs`.  Returns the number of input bytes consumed, which
+/// is always `compressed_size`.
+static size_t fse_decode_hufweights(const u8 *src, size_t src_len, u8 *weights,
+                                    int *num_symbs, size_t compressed_size) {
+    const int MAX_ACCURACY_LOG = 7;
+
+    // The whole segment has to be present before any of it is parsed
+    if (src_len < compressed_size) {
+        INP_SIZE();
+    }
+
+    FSE_dtable dtable;
+
+    // Construct the FSE table; the header must lie inside the segment, so
+    // the segment length is the bound handed to the header decoder
+    size_t read =
+        FSE_decode_header(&dtable, src, compressed_size, MAX_ACCURACY_LOG);
+
+    // Guard the subtraction below against wrapping around
+    if (read > compressed_size) {
+        FSE_free_dtable(&dtable);
+        CORRUPTION();
+    }
+
+    // Decode the weights
+    *num_symbs = FSE_decompress_interleaved2(
+        &dtable, weights, HUF_MAX_SYMBS, src + read, compressed_size - read);
+
+    FSE_free_dtable(&dtable);
+
+    return compressed_size;
+}
+/******* END LITERALS DECODING ************************************************/
+
+/******* SEQUENCE DECODING ****************************************************/
+/// Working state for decoding sequences: one FSE state and one table per field
+typedef struct {
+    u16 ll_state, of_state, ml_state; // literal length, offset, match length
+    FSE_dtable ll_table, of_table, ml_table; // byte-wise copies of ctx tables
+} sequence_state_t;
+
+/// Which sequence field a table is for; indexes decode_seq_table's arrays
+typedef enum {
+    seq_literal_length = 0,
+    seq_offset = 1,
+    seq_match_length = 2,
+} seq_part_t;
+
+typedef enum {
+    seq_predefined = 0, // build the table from the default distribution
+    seq_rle = 1,        // single-symbol table, symbol read from one byte
+    seq_fse = 2,        // table description is decoded from the input
+    seq_repeat = 3,     // leave the existing table untouched
+} seq_mode_t;
+
+/// Default distributions passed to FSE_init_dtable in `seq_predefined` mode
+static const i16 SEQ_LITERAL_LENGTH_DEFAULT_DIST[36] = {
+    4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1,  1,  2,  2,
+    2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1, -1, -1, -1, -1}; // accuracy 6
+static const i16 SEQ_OFFSET_DEFAULT_DIST[29] = {
+    1, 1, 1, 1, 1, 1, 2, 2, 2, 1,  1,  1,  1,  1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1}; // accuracy 5
+static const i16 SEQ_MATCH_LENGTH_DEFAULT_DIST[53] = {
+    1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1,  1,  1,  1,  1,  1,  1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  1,  1,  1,  1,  1,  1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1}; // accuracy 6
+
+/// The sequence decoding baseline and number of additional bits to read/add
+/// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#the-codes-for-literals-lengths-match-lengths-and-offsets
+///
+/// A literal length is BASELINES[code] plus EXTRA_BITS[code] bits read from
+/// the bitstream.  Each baseline continues where the previous code's range
+/// ends, so code 35 starts at 32768 + 2^15 = 65536.
+static const u32 SEQ_LITERAL_LENGTH_BASELINES[36] = {
+    0,  1,  2,   3,   4,   5,    6,    7,    8,    9,     10,    11,
+    12, 13, 14,  15,  16,  18,   20,   22,   24,   28,    32,    40,
+    48, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536};
+static const u8 SEQ_LITERAL_LENGTH_EXTRA_BITS[36] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  1,  1,
+    1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+
+static const u32 SEQ_MATCH_LENGTH_BASELINES[53] = { // base length per ml code
+    3,  4,   5,   6,   7,    8,    9,    10,   11,    12,    13,   14, 15, 16,
+    17, 18,  19,  20,  21,   22,   23,   24,   25,    26,    27,   28, 29, 30,
+    31, 32,  33,  34,  35,   37,   39,   41,   43,    47,    51,   59, 67, 83,
+    99, 131, 259, 515, 1027, 2051, 4099, 8195, 16387, 32771, 65539};
+static const u8 SEQ_MATCH_LENGTH_EXTRA_BITS[53] = { // bits added to the base
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,  0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  1,  1,  1, 1,
+    2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+
+/// Highest valid code per seq_part_t; decode_sequence checks only ll and ml
+static const u8 SEQ_MAX_CODES[3] = {35, -1, 52}; // offset slot is a filler
+
+static void decompress_sequences(frame_context_t *ctx, const u8 *src,
+                                 size_t src_len, sequence_command_t *sequences,
+                                 size_t num_sequences);
+static sequence_command_t decode_sequence(sequence_state_t *state,
+                                          const u8 *src, i64 *offset);
+static size_t decode_seq_table(const u8 *src, size_t src_len, FSE_dtable *table,
+                               seq_part_t type, seq_mode_t mode);
+
+/// Parse the sequence count at the start of the sequences section, allocate
+/// the command array and fill it via decompress_sequences.
+/// Stores the malloc'ed array in `*sequences` (NULL when there are no
+/// sequences) and returns the number of sequences.
+static size_t decode_sequences(frame_context_t *ctx, const u8 *src,
+                               size_t src_len, sequence_command_t **sequences) {
+    if (src_len < 1) {
+        INP_SIZE();
+    }
+
+    const u8 first = src[0];
+    if (first == 0) {
+        // No sequences in this block
+        *sequences = NULL;
+        return 0;
+    }
+
+    // The count takes 1, 2 or 3 bytes depending on the first byte
+    size_t num_sequences;
+    size_t header_len;
+    if (first < 128) {
+        num_sequences = first;
+        header_len = 1;
+    } else if (first < 255) {
+        if (src_len < 2) {
+            INP_SIZE();
+        }
+        num_sequences = ((first - 128) << 8) + src[1];
+        header_len = 2;
+    } else {
+        if (src_len < 3) {
+            INP_SIZE();
+        }
+        num_sequences = src[1] + ((u64)src[2] << 8) + 0x7F00;
+        header_len = 3;
+    }
+    src += header_len;
+    src_len -= header_len;
+
+    sequence_command_t *const commands =
+        malloc(num_sequences * sizeof(sequence_command_t));
+    *sequences = commands;
+    if (!commands) {
+        BAD_ALLOC();
+    }
+
+    decompress_sequences(ctx, src, src_len, commands, num_sequences);
+    return num_sequences;
+}
+
+/// Decompress the FSE encoded sequence commands
+///
+/// `src`/`src_len` is the sequences section after the sequence count: one
+/// byte of compression modes, the table descriptions those modes call for,
+/// then the bitstream.  The tables in `ctx` are updated according to the
+/// modes and `num_sequences` commands are written to `sequences`.
+static void decompress_sequences(frame_context_t *ctx, const u8 *src,
+                                 size_t src_len, sequence_command_t *sequences,
+                                 size_t num_sequences) {
+    if (src_len < 1) {
+        INP_SIZE();
+    }
+    u8 compression_modes = src[0];
+    src++;
+    src_len--;
+
+    // The two low bits must be zero
+    if ((compression_modes & 3) != 0) {
+        CORRUPTION();
+    }
+
+    sequence_state_t state;
+    size_t read;
+    // Update the tables we have stored in the context.  Each table must fit
+    // in what is left of the input, otherwise src_len would wrap around.
+    read = decode_seq_table(src, src_len, &ctx->ll_dtable, seq_literal_length,
+                            (compression_modes >> 6) & 3);
+    if (read > src_len) {
+        INP_SIZE();
+    }
+    src += read;
+    src_len -= read;
+    read = decode_seq_table(src, src_len, &ctx->of_dtable, seq_offset,
+                            (compression_modes >> 4) & 3);
+    if (read > src_len) {
+        INP_SIZE();
+    }
+    src += read;
+    src_len -= read;
+    read = decode_seq_table(src, src_len, &ctx->ml_dtable, seq_match_length,
+                            (compression_modes >> 2) & 3);
+    if (read > src_len) {
+        INP_SIZE();
+    }
+    src += read;
+    src_len -= read;
+
+    // Check to make sure none of the tables are uninitialized
+    if (!ctx->ll_dtable.symbols || !ctx->of_dtable.symbols ||
+        !ctx->ml_dtable.symbols) {
+        CORRUPTION();
+    }
+
+    // Now use the context's tables
+    memcpy(&state.ll_table, &ctx->ll_dtable, sizeof(FSE_dtable));
+    memcpy(&state.of_table, &ctx->of_dtable, sizeof(FSE_dtable));
+    memcpy(&state.ml_table, &ctx->ml_dtable, sizeof(FSE_dtable));
+
+    // The bitstream is read backwards starting from its last byte, so there
+    // has to be at least one byte left
+    if (src_len < 1) {
+        INP_SIZE();
+    }
+    // The highest set bit of the last byte marks the end of the stream; a
+    // zero byte has no such marker
+    if (src[src_len - 1] == 0) {
+        CORRUPTION();
+    }
+    int padding = 8 - log2inf(src[src_len - 1]);
+    i64 offset = src_len * 8 - padding;
+
+    FSE_init_state(&state.ll_table, &state.ll_state, src, &offset);
+    FSE_init_state(&state.of_table, &state.of_state, src, &offset);
+    FSE_init_state(&state.ml_table, &state.ml_state, src, &offset);
+
+    for (size_t i = 0; i < num_sequences; i++) {
+        // Decode sequences one by one
+        sequences[i] = decode_sequence(&state, src, &offset);
+    }
+
+    // Every bit of the stream must have been consumed exactly
+    if (offset != 0) {
+        CORRUPTION();
+    }
+
+    // Don't free our tables so they can be used in the next block
+}
+
+// Decode a single sequence and update the state
+//
+// `src` is the start of the sequence bitstream and `*offset` the current bit
+// position in it, which moves towards 0 as bits are consumed.  Returns the
+// decoded command; `offset` in the result is the raw offset value, before
+// repeat-offset handling.
+static sequence_command_t decode_sequence(sequence_state_t *state,
+                                          const u8 *src, i64 *offset) {
+    // Largest offset code for which `(u32)1 << code` is still defined
+    const u8 MAX_OFFSET_CODE = 31;
+
+    // Decode symbols, but don't update states
+    u8 of_code = FSE_peek_symbol(&state->of_table, state->of_state);
+    u8 ll_code = FSE_peek_symbol(&state->ll_table, state->ll_state);
+    u8 ml_code = FSE_peek_symbol(&state->ml_table, state->ml_state);
+
+    // Literal and match length codes index the baseline tables, so they must
+    // stay within them
+    if (ll_code > SEQ_MAX_CODES[seq_literal_length] ||
+        ml_code > SEQ_MAX_CODES[seq_match_length]) {
+        CORRUPTION();
+    }
+    // The offset code is used as a shift count and a bit count instead of a
+    // table index, but it still has to be small enough for the shift
+    if (of_code > MAX_OFFSET_CODE) {
+        CORRUPTION();
+    }
+
+    // Read the interleaved bits
+    sequence_command_t seq;
+    // Offset computation works differently
+    seq.offset = ((u32)1 << of_code) + STREAM_read_bits(src, of_code, offset);
+    seq.match_length =
+        SEQ_MATCH_LENGTH_BASELINES[ml_code] +
+        STREAM_read_bits(src, SEQ_MATCH_LENGTH_EXTRA_BITS[ml_code], offset);
+    seq.literal_length =
+        SEQ_LITERAL_LENGTH_BASELINES[ll_code] +
+        STREAM_read_bits(src, SEQ_LITERAL_LENGTH_EXTRA_BITS[ll_code], offset);
+
+    // If the stream is complete don't read bits to update state
+    if (*offset != 0) {
+        // Update state in the order specified in the specification
+        FSE_update_state(&state->ll_table, &state->ll_state, src, offset);
+        FSE_update_state(&state->ml_table, &state->ml_state, src, offset);
+        FSE_update_state(&state->of_table, &state->of_state, src, offset);
+    }
+
+    return seq;
+}
+
+/// Given a sequence part and table mode, set up the FSE table accordingly
+///
+/// Unless the mode is `seq_repeat`, the previous contents of `table` are
+/// freed first.  Returns the number of bytes of `src` that were consumed.
+static size_t decode_seq_table(const u8 *src, size_t src_len, FSE_dtable *table,
+                               seq_part_t type, seq_mode_t mode) {
+    // Per-part constants, indexed by seq_part_t
+    static const i16 *const predefined_dists[] = {
+        SEQ_LITERAL_LENGTH_DEFAULT_DIST, SEQ_OFFSET_DEFAULT_DIST,
+        SEQ_MATCH_LENGTH_DEFAULT_DIST};
+    static const size_t predefined_lengths[] = {36, 29, 53};
+    static const size_t predefined_accuracies[] = {6, 5, 6};
+    static const size_t max_accuracies[] = {9, 8, 9};
+
+    if (mode == seq_repeat) {
+        // Don't have to do anything here as we're not changing the table
+        return 0;
+    }
+
+    // Free the old table before overwriting it
+    FSE_free_dtable(table);
+
+    if (mode == seq_predefined) {
+        // Nothing is read from the input for the default distribution
+        FSE_init_dtable(table, predefined_dists[type],
+                        predefined_lengths[type], predefined_accuracies[type]);
+        return 0;
+    }
+
+    if (mode == seq_rle) {
+        // A single byte holds the only symbol of the table
+        if (src_len < 1) {
+            INP_SIZE();
+        }
+        FSE_init_dtable_rle(table, src[0]);
+        return 1;
+    }
+
+    if (mode == seq_fse) {
+        // The table description is in the input; report how long it was
+        return FSE_decode_header(table, src, src_len, max_accuracies[type]);
+    }
+
+    // Impossible, as mode is from 0-3
+    return -1;
+}
+/******* END SEQUENCE DECODING ************************************************/
+
+/******* SEQUENCE EXECUTION ***************************************************/
+/// Execute decoded sequence commands: for each command copy `literal_length`
+/// literals to the output, then copy `match_length` bytes from `offset` bytes
+/// back in the window.  Literals left over after the last command are copied
+/// at the end.  Updates the repeat-offset history and the running output
+/// total stored in `ctx`, and returns that total.
+static size_t execute_sequences(io_streams_t *streams, frame_context_t *ctx,
+                                sequence_command_t *sequences,
+                                size_t num_sequences, const u8 *literals,
+                                size_t literals_len) {
+    u64 *offset_hist = ctx->previous_offsets;
+    size_t total_output = ctx->current_total_output;
+
+    for (size_t i = 0; i < num_sequences; i++) {
+        sequence_command_t seq = sequences[i];
+
+        if (seq.literal_length > literals_len) {
+            CORRUPTION();
+        }
+
+        {
+            // Copy literals to the buffer
+            size_t written =
+                cbuf_write_data_full(&ctx->window, literals, seq.literal_length,
+                                     streams->dst, streams->dst_len);
+
+            literals += seq.literal_length;
+            literals_len -= seq.literal_length;
+
+            streams->dst += written;
+            streams->dst_len -= written;
+
+            total_output += seq.literal_length;
+        }
+
+        size_t offset;
+
+        // Offsets are special, we need to handle the repeat offsets
+        if (seq.offset <= 3) {
+            u32 idx = seq.offset;
+            if (seq.literal_length == 0) {
+                // Special case when literal length is 0
+                idx++;
+            }
+
+            if (idx == 1) {
+                // Most recent offset again: the history is left unchanged
+                offset = offset_hist[1];
+            } else {
+                // If idx == 4 then literal length was 0 and the offset was 3
+                offset = idx < 4 ? offset_hist[idx] : offset_hist[1] - 1;
+
+                // If idx == 2 we don't need to modify offset_hist[3]
+                if (idx > 2) {
+                    offset_hist[3] = offset_hist[2];
+                }
+                offset_hist[2] = offset_hist[1];
+                offset_hist[1] = offset;
+            }
+        } else {
+            offset = seq.offset - 3;
+
+            // Shift back history
+            offset_hist[3] = offset_hist[2];
+            offset_hist[2] = offset_hist[1];
+            offset_hist[1] = offset;
+        }
+
+        // An offset of 0 (possible through `offset_hist[1] - 1` or a bad
+        // history entry) refers to no earlier data, and an offset larger than
+        // everything produced so far points before the start of the output
+        if (offset == 0 || offset > total_output) {
+            CORRUPTION();
+        }
+
+        {
+            // Do the offset copy operation
+            size_t written =
+                cbuf_copy_offset_full(&ctx->window, offset, seq.match_length,
+                                      streams->dst, streams->dst_len);
+
+            streams->dst += written;
+            streams->dst_len -= written;
+            total_output += seq.match_length;
+        }
+    }
+
+    {
+        // Copy any leftover literal bytes
+        size_t written =
+            cbuf_write_data_full(&ctx->window, literals, literals_len,
+                                 streams->dst, streams->dst_len);
+        streams->dst += written;
+        streams->dst_len -= written;
+
+        total_output += literals_len;
+    }
+
+    ctx->current_total_output = total_output;
+
+    return total_output;
+}
+/******* END SEQUENCE EXECUTION ***********************************************/
+
+/******* DICTIONARY PARSING ***************************************************/
+static void init_raw_content_dict(dictionary_t *dict, const u8 *src,
+                                  size_t src_len);
+
+/// Fill `dict` from the `src_len` bytes at `src`.  Input that does not start
+/// with the dictionary magic number is handed to init_raw_content_dict;
+/// otherwise the dictionary ID, the entropy tables, the three previous
+/// offsets and the content are parsed in that order.
+static void parse_dictionary(dictionary_t *dict, const u8 *src,
+                             size_t src_len) {
+    memset(dict, 0, sizeof(dictionary_t));
+    if (src_len < 8) {
+        INP_SIZE();
+    }
+
+    const u32 magic_number = read_bits_LE(src, 32, 0);
+    if (magic_number != 0xEC30A437) {
+        // raw content dict
+        init_raw_content_dict(dict, src, src_len);
+        return;
+    }
+    dict->dictionary_id = read_bits_LE(src, 32, 32);
+
+    src += 8;
+    src_len -= 8;
+
+    // Parse the provided entropy tables in order: Huffman literals table,
+    // then the offset, match length and literal length FSE tables
+    size_t consumed;
+
+    consumed = decode_huf_table(src, src_len, &dict->literals_dtable);
+    src += consumed;
+    src_len -= consumed;
+
+    consumed =
+        decode_seq_table(src, src_len, &dict->of_dtable, seq_offset, seq_fse);
+    src += consumed;
+    src_len -= consumed;
+
+    consumed = decode_seq_table(src, src_len, &dict->ml_dtable,
+                                seq_match_length, seq_fse);
+    src += consumed;
+    src_len -= consumed;
+
+    consumed = decode_seq_table(src, src_len, &dict->ll_dtable,
+                                seq_literal_length, seq_fse);
+    src += consumed;
+    src_len -= consumed;
+
+    if (src_len < 12) {
+        INP_SIZE();
+    }
+    // Everything after the three 32-bit offsets is the dictionary content
+    const u8 *const content = src + 12;
+    const size_t content_len = src_len - 12;
+
+    // Read in the previous offset history and ensure the provided offsets
+    // aren't too large for the content
+    for (int i = 1; i <= 3; i++) {
+        dict->previous_offsets[i] = read_bits_LE(src, 32, (i - 1) * 32);
+        if (dict->previous_offsets[i] > content_len) {
+            ERROR("Dictionary corrupted");
+        }
+    }
+
+    // The rest is the content
+    dict->content = malloc(content_len);
+    if (!dict->content) {
+        BAD_ALLOC();
+    }
+
+    dict->content_size = content_len;
+    memcpy(dict->content, content, content_len);
+}
+
+/// Raw content dictionaries have no header or tables: the whole input is the
+/// content.  parse_dictionary delegates here when the magic number is absent.
+static void init_raw_content_dict(dictionary_t *dict, const u8 *src,
+                                  size_t src_len) {
+    // Take a private copy of the content
+    u8 *const copy = malloc(src_len);
+
+    dict->dictionary_id = 0;
+    dict->content = copy;
+    if (!copy) {
+        BAD_ALLOC();
+    }
+
+    memcpy(copy, src, src_len);
+    dict->content_size = src_len;
+}
+
+/// Release everything owned by a dictionary and reset it to all zeroes
+static void free_dictionary(dictionary_t *dict) {
+    // The content copy
+    free(dict->content);
+
+    // The entropy tables
+    FSE_free_dtable(&dict->ml_dtable);
+    FSE_free_dtable(&dict->of_dtable);
+    FSE_free_dtable(&dict->ll_dtable);
+    HUF_free_dtable(&dict->literals_dtable);
+
+    memset(dict, 0, sizeof(*dict));
+}
+/******* END DICTIONARY PARSING ***********************************************/
+
+/******* CIRCULAR BUFFER ******************************************************/
+/// Initialize a circular buffer holding `size` bytes.  A size of 0 is
+/// accepted; the other cbuf_* routines treat such a buffer as unable to hold
+/// any data.
+static void cbuf_init(cbuf_t *buf, size_t size) {
+    // malloc(0) is allowed to return NULL, which would be reported as an
+    // allocation failure even though nothing went wrong.  Always request at
+    // least one byte so that NULL only ever means failure.
+    buf->ptr = malloc(size > 0 ? size : 1);
+
+    if (!buf->ptr) {
+        BAD_ALLOC();
+    }
+
+    // Fill with a fixed byte so the initial contents are deterministic
+    memset(buf->ptr, 0x3f, size);
+
+    buf->size = size;
+    buf->idx = 0;
+    buf->last_flush = 0;
+}
+
+/// Copy as much of `src` as fits between the write index and the end of the
+/// buffer, advance the write index and return the number of bytes copied.
+static size_t cbuf_write_data(cbuf_t *buf, const u8 *src, size_t src_len) {
+    if (src_len > 0 && buf->size == 0) {
+        CORRUPTION();
+    }
+
+    // Never write past the end of the buffer; the caller flushes and retries
+    const size_t room = buf->size - buf->idx;
+    const size_t count = (src_len < room) ? src_len : room;
+
+    memcpy(buf->ptr + buf->idx, src, count);
+    buf->idx += count;
+
+    return count;
+}
+
+/// Write all of `src` into the buffer, flushing to `out` every time the
+/// buffer fills up.  Returns the number of bytes flushed to `out`.
+static size_t cbuf_write_data_full(cbuf_t *buf, const u8 *src, size_t src_len,
+                                   u8 *out, size_t out_len) {
+    size_t flushed = 0;
+    size_t done = cbuf_write_data(buf, src, src_len);
+
+    while (done != src_len) {
+        // No room left: flush what is pending, then continue writing
+        flushed += cbuf_flush(buf, out + flushed, out_len - flushed);
+        done += cbuf_write_data(buf, src + done, src_len - done);
+    }
+
+    return flushed;
+}
+
+/// Copy up to `len` bytes, starting `offset` bytes behind the write index, to
+/// the write index.  Stops at the end of the buffer and returns the number of
+/// bytes copied.  The copy goes byte by byte, so source and destination may
+/// overlap.
+static size_t cbuf_copy_offset(cbuf_t *buf, size_t offset, size_t len) {
+    if (len > 0 && buf->size == 0) {
+        CORRUPTION();
+    }
+    if (buf->size < offset) {
+        CORRUPTION();
+    }
+
+    const size_t room = buf->size - buf->idx;
+    if (room < len) {
+        len = room;
+    }
+
+    // The source position wraps around the start of the buffer
+    size_t from = (buf->idx + buf->size - offset) % buf->size;
+
+    for (size_t remaining = len; remaining > 0; remaining--) {
+        buf->ptr[buf->idx] = buf->ptr[from];
+        buf->idx++;
+        from++;
+        if (from == buf->size) {
+            from = 0;
+        }
+    }
+
+    return len;
+}
+
+/// Perform a complete offset copy of `len` bytes, flushing to `out` every
+/// time the buffer fills up.  Returns the number of bytes flushed to `out`.
+static size_t cbuf_copy_offset_full(cbuf_t *buf, size_t offset, size_t len,
+                                    u8 *out, size_t out_len) {
+    size_t flushed = 0;
+    size_t done = cbuf_copy_offset(buf, offset, len);
+
+    while (done != len) {
+        // No room left: flush what is pending, then continue copying
+        flushed += cbuf_flush(buf, out + flushed, out_len - flushed);
+        done += cbuf_copy_offset(buf, offset, len - done);
+    }
+
+    return flushed;
+}
+
+/// Write up to `len` copies of `byte` at the write index, stopping at the end
+/// of the buffer.  Advances the write index and returns the number of bytes
+/// written.
+static size_t cbuf_repeat_byte(cbuf_t *buf, u8 byte, size_t len) {
+    if (buf->size == 0 && len > 0) {
+        CORRUPTION();
+    }
+    size_t max_len = buf->size - buf->idx;
+    len = MIN(len, max_len);
+
+    memset(buf->ptr + buf->idx, byte, len);
+
+    // Account for the bytes just written, as cbuf_write_data does.  Without
+    // this they are never flushed and the next write overwrites them.
+    buf->idx += len;
+
+    return len;
+}
+
+/// Write `len` copies of `byte`, flushing to `out` between partial writes.
+/// Returns the number of bytes flushed to `out`.
+static size_t cbuf_repeat_byte_full(cbuf_t *buf, u8 byte, size_t len, u8 *out,
+                                    size_t out_len) {
+    size_t flushed = 0;
+    size_t done = cbuf_repeat_byte(buf, byte, len);
+
+    while (done != len) {
+        // Flush what is pending, then continue with the remainder
+        flushed += cbuf_flush(buf, out + flushed, out_len - flushed);
+        done += cbuf_repeat_byte(buf, byte, len - done);
+    }
+
+    return flushed;
+}
+
+/// Copy the bytes written since the previous flush to `dst` and wrap the
+/// write index back into the buffer.  A NULL `dst` discards the data.
+/// Returns the number of bytes that were pending.
+static size_t cbuf_flush(cbuf_t *buf, u8 *dst, size_t dst_len) {
+    if (buf->last_flush > buf->idx) {
+        CORRUPTION();
+    }
+
+    const size_t pending = buf->idx - buf->last_flush;
+
+    // allow for NULL buffers to indicate flushing to nowhere
+    if (dst != NULL) {
+        if (pending > dst_len) {
+            OUT_SIZE();
+        }
+        memcpy(dst, buf->ptr + buf->last_flush, pending);
+    }
+
+    // we could have a 0 size buffer, in which case there is nothing to wrap
+    if (buf->size != 0) {
+        buf->idx %= buf->size;
+    }
+    buf->last_flush = buf->idx;
+
+    return pending;
+}
+
+/// Release the buffer's storage and reset the structure to all zeroes
+static void cbuf_free(cbuf_t *buf) {
+    u8 *const storage = buf->ptr;
+
+    free(storage);
+    memset(buf, 0, sizeof(*buf));
+}
+/******* END CIRCULAR BUFFER **************************************************/
+
+/******* BITSTREAM OPERATIONS *************************************************/
+/// Read `num` bits starting `offset` bits into `src`, least significant bit
+/// first, and return them as a number.  Returns -1 (all ones) when more than
+/// 64 bits are requested.
+static inline u64 read_bits_LE(const u8 *src, int num, size_t offset) {
+    if (num > 64) {
+        return -1;
+    }
+
+    // Start at the byte containing the first bit; `skip` is the number of
+    // low bits of that byte which are not part of the value
+    const u8 *cursor = src + offset / 8;
+    size_t skip = offset % 8;
+
+    u64 value = 0;
+    int out_pos = 0;
+
+    for (int remaining = num; remaining > 0;) {
+        const u64 mask = remaining >= 8 ? 0xff : (((u64)1 << remaining) - 1);
+        value += (((u64)*cursor >> skip) & mask) << out_pos;
+        cursor++;
+
+        out_pos += 8 - skip;
+        remaining -= 8 - skip;
+        // Only the first byte can be entered part-way through
+        skip = 0;
+    }
+
+    return value;
+}
+
+/// Read `bits` bits from a stream that is consumed backwards: `*offset` is
+/// first moved down by `bits` and the value is read at the new position.
+/// When that position is below 0, only the bits that exist are read and the
+/// missing low bits are filled with 0's.
+static inline u64 STREAM_read_bits(const u8 *src, int bits, i64 *offset) {
+    *offset -= bits;
+    const i64 pos = *offset;
+
+    if (pos >= 0) {
+        return read_bits_LE(src, bits, (size_t)pos);
+    }
+
+    // `missing` bits lie before the start of the stream; read the rest from
+    // bit 0 and shift it up into place
+    const i64 missing = -pos;
+    const u64 partial = read_bits_LE(src, bits + pos, 0);
+
+    return missing >= 64 ? 0 : (partial << missing);
+}
+/******* END BITSTREAM OPERATIONS *********************************************/
+
+/******* BIT COUNTING OPERATIONS **********************************************/
+/// Returns the smallest i in [0, 63] such that 2^i >= num, or -1 when no
+/// such i exists
+static inline int log2sup(u64 num) {
+    u64 power = 1;
+    for (int exponent = 0; exponent < 64; exponent++) {
+        if (power >= num) {
+            return exponent;
+        }
+        power <<= 1;
+    }
+    return -1;
+}
+
+/// Returns the index of the highest set bit of `num`, i.e. the largest i such
+/// that 2^i <= num, or -1 when `num` is 0
+static inline int log2inf(u64 num) {
+    int highest = -1;
+    while (num != 0) {
+        num >>= 1;
+        highest++;
+    }
+    return highest;
+}
+/******* END BIT COUNTING OPERATIONS ******************************************/
+
+/******* HUFFMAN PRIMITIVES ***************************************************/
+/// Return the symbol for the current state, then shift the bits it used out
+/// of the state and refill the state with as many new bits from the stream
+static inline u8 HUF_decode_symbol(HUF_dtable *dtable, u16 *state,
+                                   const u8 *src, i64 *offset) {
+    const u16 current = *state;
+
+    // Look up the symbol and number of bits to read
+    const u8 symb = dtable->symbols[current];
+    const u8 bits = dtable->num_bits[current];
+
+    const u16 rest = STREAM_read_bits(src, bits, offset);
+    // The state only ever keeps its low max_bits bits
+    const u16 state_mask = ((u16)1 << dtable->max_bits) - 1;
+    *state = ((current << bits) + rest) & state_mask;
+
+    return symb;
+}
+
+/// Load the initial decoding state from the stream
+static inline void HUF_init_state(HUF_dtable *dtable, u16 *state, const u8 *src,
+                                  i64 *offset) {
+    // The state is exactly max_bits wide, so that is how much gets read
+    const u8 state_width = dtable->max_bits;
+    const u64 initial = STREAM_read_bits(src, state_width, offset);
+    *state = initial;
+}
+
+/// Decode a single Huffman-coded bitstream of `src_len` bytes at `src` into
+/// `dst`, which has room for `dst_len` symbols.  Returns the number of
+/// symbols written.
+static size_t HUF_decompress_1stream(HUF_dtable *dtable, u8 *dst,
+                                     size_t dst_len, const u8 *src,
+                                     size_t src_len) {
+    u8 *const dst_max = dst + dst_len;
+    u8 *const odst = dst;
+
+    // The stream is located through its last byte, so it cannot be empty
+    if (src_len < 1) {
+        INP_SIZE();
+    }
+    // The highest set bit of the last byte marks the end of the stream; a
+    // zero byte has no such marker
+    if (src[src_len - 1] == 0) {
+        CORRUPTION();
+    }
+
+    // To maintain similarity with FSE, start from the end
+    // Find the last 1 bit
+    int padding = 8 - log2inf(src[src_len - 1]);
+
+    i64 offset = src_len * 8 - padding;
+    u16 state;
+
+    HUF_init_state(dtable, &state, src, &offset);
+
+    while (dst < dst_max && offset > -dtable->max_bits) {
+        *dst++ = HUF_decode_symbol(dtable, &state, src, &offset);
+    }
+    // If we stopped before consuming all the input, we didn't have enough space
+    if (dst == dst_max && offset > -dtable->max_bits) {
+        OUT_SIZE();
+    }
+
+    // The current state should be the `max_bits` preceding the start as
+    // everything from `src` onward should be consumed
+    if (offset != -dtable->max_bits) {
+        CORRUPTION();
+    }
+
+    return dst - odst;
+}
+
+/// Decode four Huffman-coded bitstreams that share one table.  The input
+/// starts with three 16-bit little-endian stream sizes; the fourth stream
+/// takes whatever input is left.  The first three streams each produce
+/// (dst_len + 3) / 4 bytes and the last one produces the rest.  Returns the
+/// total number of bytes written to `dst`.
+static size_t HUF_decompress_4stream(HUF_dtable *dtable, u8 *dst,
+                                     size_t dst_len, const u8 *src,
+                                     size_t src_len) {
+    // Decode each stream independently for simplicity
+    // If we wanted to we could decode all 4 at the same time for speed,
+    // utilizing more execution units
+
+    if (src_len < 6) {
+        INP_SIZE();
+    }
+
+    const size_t size1 = read_bits_LE(src, 16, 0);
+    const size_t size2 = read_bits_LE(src, 16, 16);
+    const size_t size3 = read_bits_LE(src, 16, 32);
+
+    // We can't test with all 4 sizes because the 4th size is a function of the
+    // other 3 and the provided length; it has to come out as at least 1 byte
+    if (6 + size1 + size2 + size3 >= src_len) {
+        INP_SIZE();
+    }
+
+    const u8 *const src1 = src + 6;
+    const u8 *const src2 = src1 + size1;
+    const u8 *const src3 = src2 + size2;
+    const u8 *const src4 = src3 + size3;
+    const u8 *const src_end = src + src_len;
+
+    const size_t segment_size = (dst_len + 3) / 4;
+
+    // The first three segments must fit inside dst.  For dst_len of 1, 2 or 5
+    // they do not, so decoding would write past the end of dst and the
+    // length left for the 4th stream would be negative.
+    if (3 * segment_size > dst_len) {
+        CORRUPTION();
+    }
+
+    u8 *const dst1 = dst;
+    u8 *const dst2 = dst1 + segment_size;
+    u8 *const dst3 = dst2 + segment_size;
+    u8 *const dst4 = dst3 + segment_size;
+    u8 *const dst_end = dst + dst_len;
+
+    size_t total_out = 0;
+
+    total_out +=
+        HUF_decompress_1stream(dtable, dst1, segment_size, src1, src2 - src1);
+    total_out +=
+        HUF_decompress_1stream(dtable, dst2, segment_size, src2, src3 - src2);
+    total_out +=
+        HUF_decompress_1stream(dtable, dst3, segment_size, src3, src4 - src3);
+    total_out += HUF_decompress_1stream(dtable, dst4, dst_end - dst4, src4,
+                                        src_end - src4);
+
+    return total_out;
+}
+
+/// Build a Huffman decoding table from per-symbol code lengths.
+///
+/// `bits[i]` is the code length of symbol `i` (0 means the symbol has no
+/// code) for `num_symbs` symbols.  On return `table->symbols` and
+/// `table->num_bits` are malloc'ed arrays of 2^max_bits entries, indexed by
+/// the next max_bits bits of input.
+static void HUF_init_dtable(HUF_dtable *table, u8 *bits, int num_symbs) {
+    memset(table, 0, sizeof(HUF_dtable));
+    if (num_symbs > HUF_MAX_SYMBS) {
+        ERROR("Too many symbols for Huffman");
+    }
+
+    u8 max_bits = 0;
+    u16 rank_count[HUF_MAX_BITS + 1];
+    memset(rank_count, 0, sizeof(rank_count));
+
+    // Count the number of symbols for each number of bits, and determine the
+    // depth of the tree
+    for (int i = 0; i < num_symbs; i++) {
+        if (bits[i] > HUF_MAX_BITS) {
+            ERROR("Huffman table depth too large");
+        }
+        max_bits = MAX(max_bits, bits[i]);
+        rank_count[bits[i]]++;
+    }
+
+    size_t table_size = 1 << max_bits;
+    table->max_bits = max_bits;
+    table->symbols = malloc(table_size);
+    table->num_bits = malloc(table_size);
+
+    if (!table->symbols || !table->num_bits) {
+        free(table->symbols);
+        free(table->num_bits);
+        BAD_ALLOC();
+    }
+
+    u32 rank_idx[HUF_MAX_BITS + 1];
+    // Initialize the starting codes for each rank (number of bits)
+    rank_idx[max_bits] = 0;
+    for (int i = max_bits; i >= 1; i--) {
+        rank_idx[i - 1] = rank_idx[i] + rank_count[i] * (1 << (max_bits - i));
+        // Code lengths that claim more entries than the table has must be
+        // rejected before anything is written, or the memset below would
+        // run past the end of num_bits
+        if (rank_idx[i - 1] > table_size) {
+            CORRUPTION();
+        }
+        // The entire range takes the same number of bits so we can memset it
+        memset(&table->num_bits[rank_idx[i]], i, rank_idx[i - 1] - rank_idx[i]);
+    }
+
+    // The codes must cover the table exactly
+    if (rank_idx[0] != table_size) {
+        CORRUPTION();
+    }
+
+    // Allocate codes and fill in the table
+    for (int i = 0; i < num_symbs; i++) {
+        if (bits[i] != 0) {
+            // Allocate a code for this symbol and set its range in the table
+            const u16 code = rank_idx[bits[i]];
+            const u16 len = 1 << (max_bits - bits[i]);
+            memset(&table->symbols[code], i, len);
+            rank_idx[bits[i]] += len;
+        }
+    }
+}
+
+static void HUF_init_dtable_usingweights(HUF_dtable *table, u8 *weights,
+                                         int num_symbs) {
+    // Builds `table` from Huffman weights, deriving the final implicit weight.
+    // The +1 is because the last weight is not transmitted in the header
+    if (num_symbs + 1 > HUF_MAX_SYMBS) {
+        ERROR("Too many symbols for Huffman");
+    }
+    u8 bits[HUF_MAX_SYMBS];
+    u64 weight_sum = 0;
+    for (int i = 0; i < num_symbs; i++) {
+        // Weights can't exceed the max code length (huge ones break the shift)
+        if (weights[i] > HUF_MAX_BITS) {
+            CORRUPTION();
+        }
+        weight_sum += weights[i] > 0 ? (u64)1 << (weights[i] - 1) : 0;
+    }
+
+    // Find the first power of 2 larger than the sum
+    int max_bits = log2inf(weight_sum) + 1;
+    u64 left_over = ((u64)1 << max_bits) - weight_sum;
+    // If the left over isn't a power of 2, the weights are invalid
+    if (left_over & (left_over - 1)) {
+        CORRUPTION();
+    }
+    int last_weight = log2inf(left_over) + 1;
+    for (int i = 0; i < num_symbs; i++) {
+        bits[i] = weights[i] > 0 ? (max_bits + 1 - weights[i]) : 0;
+    }
+    // last weight is always non-zero
+    bits[num_symbs] = max_bits + 1 - last_weight;
+    HUF_init_dtable(table, bits, num_symbs + 1);
+}
+
+static void HUF_free_dtable(HUF_dtable *dtable) {
+    free(dtable->symbols); // free(NULL) is a no-op, so empty tables are fine
+    free(dtable->num_bits);
+    memset(dtable, 0, sizeof(HUF_dtable)); // back to the empty state
+}
+
+static void HUF_copy_dtable(HUF_dtable *dst, const HUF_dtable *src) {
+    // Deep copies `src` into `dst`; an empty source gives an empty copy
+    if (src->max_bits == 0) {
+        memset(dst, 0, sizeof(HUF_dtable));
+        return;
+    }
+    size_t size = (size_t)1 << src->max_bits;
+    dst->max_bits = src->max_bits;
+    dst->symbols = malloc(size);
+    dst->num_bits = malloc(size);
+    if (!dst->symbols || !dst->num_bits) {
+        free(dst->symbols); // same cleanup as HUF_init_dtable
+        free(dst->num_bits);
+        BAD_ALLOC();
+    }
+    memcpy(dst->symbols, src->symbols, size);
+    memcpy(dst->num_bits, src->num_bits, size);
+}
+/******* END HUFFMAN PRIMITIVES ***********************************************/
+
+/******* FSE PRIMITIVES *******************************************************/
+static inline u8 FSE_peek_symbol(FSE_dtable *dtable, u16 state) {
+    return dtable->symbols[state]; // symbol for `state`; consumes no bits
+}
+
+static inline void FSE_update_state(FSE_dtable *dtable, u16 *state,
+                                    const u8 *src, i64 *offset) {
+    const u8 bits = dtable->num_bits[*state]; // bits to read for this state
+    const u16 rest = STREAM_read_bits(src, bits, offset);
+    *state = dtable->new_state_base[*state] + rest; // baseline + bits read
+}
+
+// Returns the symbol for the current state, then advances the state/offset
+static inline u8 FSE_decode_symbol(FSE_dtable *dtable, u16 *state,
+                                   const u8 *src, i64 *offset) {
+    const u8 symb = FSE_peek_symbol(dtable, *state);
+    FSE_update_state(dtable, state, src, offset);
+    return symb;
+}
+
+static inline void FSE_init_state(FSE_dtable *dtable, u16 *state, const u8 *src,
+                                  i64 *offset) {
+    const u8 bits = dtable->accuracy_log; // initial state is this many bits
+    *state = STREAM_read_bits(src, bits, offset);
+}
+
+static size_t FSE_decompress_interleaved2(FSE_dtable *dtable, u8 *dst,
+                                          size_t dst_len, const u8 *src,
+                                          size_t src_len) {
+    // Decodes a backwards bitstream using two interleaved FSE states and
+    // returns the number of symbols written to `dst`
+    if (src_len == 0) {
+        INP_SIZE();
+    }
+    // The highest set bit of the last byte marks where the stream ends, so a
+    // zero last byte means the marker is missing
+    if (src[src_len - 1] == 0) {
+        CORRUPTION();
+    }
+
+    u8 *dst_max = dst + dst_len;
+    u8 *const odst = dst;
+    int padding = 8 - log2inf(src[src_len - 1]);
+    i64 offset = src_len * 8 - padding;
+    u16 state1, state2;
+    FSE_init_state(dtable, &state1, src, &offset);
+    FSE_init_state(dtable, &state2, src, &offset);
+
+    // Decode until we overflow the stream; since we decode in reverse order,
+    // overflowing the stream means offset going negative
+    while (1) {
+        if (dst_max - dst < 2) {
+            OUT_SIZE();
+        }
+        *dst++ = FSE_decode_symbol(dtable, &state1, src, &offset);
+        if (offset < 0) {
+            // There's still a symbol to decode in state2
+            *dst++ = FSE_decode_symbol(dtable, &state2, src, &offset);
+            break;
+        }
+
+        if (dst_max - dst < 2) {
+            OUT_SIZE();
+        }
+        *dst++ = FSE_decode_symbol(dtable, &state2, src, &offset);
+        if (offset < 0) {
+            // There's still a symbol to decode in state1
+            *dst++ = FSE_decode_symbol(dtable, &state1, src, &offset);
+            break;
+        }
+    }
+    return dst - odst;
+}
+
+static void FSE_init_dtable(FSE_dtable *dtable, const i16 *norm_freqs,
+                            int num_symbs, int accuracy_log) {
+    // Builds the decoding table from normalized frequencies (-1 = low prob)
+    if (accuracy_log > FSE_MAX_ACCURACY_LOG) {
+        ERROR("FSE accuracy too large");
+    }
+    if (num_symbs > FSE_MAX_SYMBS) {
+        ERROR("Too many symbols for FSE");
+    }
+
+    dtable->accuracy_log = accuracy_log;
+
+    size_t size = (size_t)1 << accuracy_log;
+    dtable->symbols = malloc(size * sizeof(u8));
+    dtable->num_bits = malloc(size * sizeof(u8));
+    dtable->new_state_base = malloc(size * sizeof(u16));
+    if (!dtable->symbols || !dtable->num_bits || !dtable->new_state_base) {
+        BAD_ALLOC();
+    }
+
+    // Used to determine how many bits need to be read for each state,
+    // and where the destination range should start
+    // Needs to be u16 because max value is 2 * max number of symbols,
+    // which can be larger than a byte can store
+    u16 state_desc[FSE_MAX_SYMBS];
+
+    int high_threshold = size;
+    for (int s = 0; s < num_symbs; s++) {
+        // Scan for low probability symbols to put at the top
+        if (norm_freqs[s] == -1) {
+            dtable->symbols[--high_threshold] = s;
+            state_desc[s] = 1;
+        }
+    }
+
+    // Place the rest in the table
+    u16 step = (size >> 1) + (size >> 3) + 3;
+    u16 mask = size - 1;
+    u16 pos = 0;
+    for (int s = 0; s < num_symbs; s++) {
+        if (norm_freqs[s] <= 0) {
+            continue;
+        }
+
+        state_desc[s] = norm_freqs[s];
+
+        for (int i = 0; i < norm_freqs[s]; i++) {
+            dtable->symbols[pos] = s;
+            // Step past any spot already taken by the low prob symbols
+            do {
+                pos = (pos + step) & mask;
+            } while (pos >= high_threshold);
+            // Note: no other collision checking is necessary as `step` is
+            // coprime to `size`, so the cycle will visit each position once
+        }
+    }
+    if (pos != 0) {
+        CORRUPTION();
+    }
+
+    // Now we can fill baseline and num bits
+    for (size_t i = 0; i < size; i++) {
+        u8 symbol = dtable->symbols[i];
+        u16 next_state_desc = state_desc[symbol]++;
+        // next_state_desc increases by symbol over time, decreasing number of
+        // bits
+        dtable->num_bits[i] = (u8)(accuracy_log - log2inf(next_state_desc));
+        // baseline increases until the bit threshold is passed, at which point
+        // it resets to 0
+        dtable->new_state_base[i] =
+            ((u16)next_state_desc << dtable->num_bits[i]) - size;
+    }
+}
+
+static size_t FSE_decode_header(FSE_dtable *dtable, const u8 *src,
+                                size_t src_len, int max_accuracy_log) {
+    // Parses an FSE table description from `src`, builds `dtable` from it and
+    // returns the number of bytes the description occupied
+    if (max_accuracy_log > FSE_MAX_ACCURACY_LOG) {
+        ERROR("FSE accuracy too large");
+    }
+    if (src_len < 1) {
+        INP_SIZE();
+    }
+
+    int accuracy_log = 5 + read_bits_LE(src, 4, 0);
+    if (accuracy_log > max_accuracy_log) {
+        ERROR("FSE accuracy too large");
+    }
+
+    // The +1 facilitates the `-1` probabilities
+    i32 remaining = (1 << accuracy_log) + 1;
+    i16 frequencies[FSE_MAX_SYMBS];
+    // read_bits_LE isn't told the input length, so bound every read here
+    const size_t src_bits = src_len * 8;
+    int symb = 0;
+    size_t offset = 4;
+    while (remaining > 1 && symb < FSE_MAX_SYMBS) {
+        // the number of possible values we could read
+        int bits = log2sup(remaining + 1);
+        if (offset + bits > src_bits) {
+            INP_SIZE();
+        }
+        u16 val = read_bits_LE(src, bits, offset);
+        offset += bits;
+
+        // try to mask out the lower bits to see if it qualifies for the "small
+        // value" threshold
+        u16 lower_mask = ((u16)1 << (bits - 1)) - 1;
+        u16 threshold = ((u16)1 << bits) - 1 - remaining;
+        if ((val & lower_mask) < threshold) {
+            offset--;
+            val = val & lower_mask;
+        } else if (val > lower_mask) {
+            val = val - threshold;
+        }
+
+        i16 proba = (i16)val - 1;
+        // a value of -1 is possible, and has special meaning
+        remaining -= proba < 0 ? -proba : proba;
+        frequencies[symb] = proba;
+        symb++;
+
+        // Handle the special probability = 0 case
+        if (proba == 0) {
+            // 2-bit fields count the extra 0s; 3 means another field follows
+            int repeat;
+            do {
+                if (offset + 2 > src_bits) {
+                    INP_SIZE();
+                }
+                repeat = read_bits_LE(src, 2, offset);
+                offset += 2;
+                for (int i = 0; i < repeat && symb < FSE_MAX_SYMBS; i++) {
+                    frequencies[symb++] = 0;
+                }
+            } while (repeat == 3);
+        }
+    }
+
+    if (remaining != 1 || symb >= FSE_MAX_SYMBS) {
+        CORRUPTION();
+    }
+
+    // Initialize the decoding table using the determined weights
+    FSE_init_dtable(dtable, frequencies, symb, accuracy_log);
+    return (offset + 7) / 8;
+}
+
+static void FSE_init_dtable_rle(FSE_dtable *dtable, u8 symb) {
+    dtable->symbols = malloc(sizeof(u8));
+    dtable->num_bits = malloc(sizeof(u8));
+    dtable->new_state_base = malloc(sizeof(u16));
+    if (!dtable->symbols || !dtable->num_bits || !dtable->new_state_base) {
+        BAD_ALLOC();
+    }
+    // Single state 0 that always yields `symb` and never consumes any bits
+    dtable->symbols[0] = symb;
+    dtable->num_bits[0] = 0;
+    dtable->new_state_base[0] = 0;
+    dtable->accuracy_log = 0;
+}
+
+static void FSE_free_dtable(FSE_dtable *dtable) {
+    free(dtable->symbols); // free(NULL) is a no-op, so empty tables are fine
+    free(dtable->num_bits);
+    free(dtable->new_state_base);
+    memset(dtable, 0, sizeof(FSE_dtable)); // back to the empty state
+}
+
+static void FSE_copy_dtable(FSE_dtable *dst, const FSE_dtable *src) {
+    // An RLE table also has accuracy_log == 0 but owns 1-entry arrays, so only
+    // a table that owns no arrays is treated as empty
+    if (src->accuracy_log == 0 && !src->symbols) {
+        memset(dst, 0, sizeof(FSE_dtable));
+        return;
+    }
+
+    size_t size = (size_t)1 << src->accuracy_log;
+    dst->accuracy_log = src->accuracy_log;
+    dst->symbols = malloc(size);
+    dst->num_bits = malloc(size);
+    dst->new_state_base = malloc(size * sizeof(u16));
+    if (!dst->symbols || !dst->num_bits || !dst->new_state_base) {
+        BAD_ALLOC();
+    }
+    memcpy(dst->symbols, src->symbols, size);
+    memcpy(dst->num_bits, src->num_bits, size);
+    memcpy(dst->new_state_base, src->new_state_base, size * sizeof(u16));
+}
+/******* END FSE PRIMITIVES ***************************************************/
+
diff --git a/contrib/educational_decoder/zstd_decompress.h b/contrib/educational_decoder/zstd_decompress.h
new file mode 100644
index 00000000..3671678b
--- /dev/null
+++ b/contrib/educational_decoder/zstd_decompress.h
@@ -0,0 +1,6 @@
+size_t ZSTD_decompress(void *dst, size_t dst_len, const void *src,
+                       size_t src_len);
+size_t ZSTD_decompress_with_dict(void *dst, size_t dst_len, const void *src,
+                                 size_t src_len, const void *dict,
+                                 size_t dict_len);
+

From 5657e0e07d4ecb23300390c0e95d4f9ec4ca1d66 Mon Sep 17 00:00:00 2001
From: Sean Purcell <me@seanp.xyz>
Date: Mon, 30 Jan 2017 14:42:21 -0800
Subject: [PATCH 02/15] Added ZSTD_get_decompressed_size

Since this implementation handles multiple concatenated frames,
to determine decompressed size we must traverse the entire input,
checking each frame's frame_content_size field
---
 contrib/educational_decoder/harness.c         | 127 ++++----
 contrib/educational_decoder/zstd_decompress.c | 298 +++++++++++++-----
 contrib/educational_decoder/zstd_decompress.h |   1 +
 3 files changed, 293 insertions(+), 133 deletions(-)

diff --git a/contrib/educational_decoder/harness.c b/contrib/educational_decoder/harness.c
index 6f4765d9..c44100ff 100644
--- a/contrib/educational_decoder/harness.c
+++ b/contrib/educational_decoder/harness.c
@@ -5,8 +5,8 @@
 
 typedef unsigned char u8;
 
-// There's no good way to determine output size without decompressing
-// For this example assume we'll never decompress at a ratio larger than 16
+// If the data doesn't have decompressed size with it, fallback on assuming the
+// compression ratio is at most 16
 #define MAX_COMPRESSION_RATIO (16)
 
 u8 *input;
@@ -14,80 +14,89 @@ u8 *output;
 u8 *dict;
 
 size_t read_file(const char *path, u8 **ptr) {
-  FILE *f = fopen(path, "rb");
-  if (!f) {
-    fprintf(stderr, "failed to open file %s\n", path);
-    exit(1);
-  }
-
-  fseek(f, 0L, SEEK_END);
-  size_t size = ftell(f);
-  rewind(f);
-
-  *ptr = malloc(size);
-  if (!ptr) {
-    fprintf(stderr, "failed to allocate memory to hold %s\n", path);
-    exit(1);
-  }
-
-  size_t pos = 0;
-  while (!feof(f)) {
-    size_t read = fread(&(*ptr)[pos], 1, size, f);
-    if (ferror(f)) {
-      fprintf(stderr, "error while reading file %s\n", path);
-      exit(1);
+    FILE *f = fopen(path, "rb");
+    if (!f) {
+        fprintf(stderr, "failed to open file %s\n", path);
+        exit(1);
     }
-    pos += read;
-  }
 
-  fclose(f);
+    fseek(f, 0L, SEEK_END);
+    size_t size = ftell(f);
+    rewind(f);
 
-  return pos;
+    *ptr = malloc(size);
+    if (size > 0 && !*ptr) { // test the buffer itself, not the out-parameter
+        fprintf(stderr, "failed to allocate memory to hold %s\n", path);
+        exit(1);
+    }
+
+    size_t pos = 0;
+    while (pos < size && !feof(f)) { // never read more than the buffer holds
+        size_t read = fread(&(*ptr)[pos], 1, size - pos, f);
+        if (ferror(f)) {
+            fprintf(stderr, "error while reading file %s\n", path);
+            exit(1);
+        }
+        pos += read;
+    }
+
+    fclose(f);
+
+    return pos;
 }
 
 void write_file(const char *path, const u8 *ptr, size_t size) {
-  FILE *f = fopen(path, "wb");
+    FILE *f = fopen(path, "wb");
 
-  size_t written = 0;
-  while (written < size) {
-    written += fwrite(&ptr[written], 1, size, f);
-    if (ferror(f)) {
-      fprintf(stderr, "error while writing file %s\n", path);
-      exit(1);
+    size_t written = 0;
+    while (written < size) { // after a short write, only the rest is retried
+        written += fwrite(&ptr[written], 1, size - written, f);
+        if (ferror(f)) {
+            fprintf(stderr, "error while writing file %s\n", path);
+            exit(1);
+        }
     }
-  }
 
-  fclose(f);
+    fclose(f);
 }
 
 int main(int argc, char **argv) {
-  if (argc < 3) {
-    fprintf(stderr, "usage: %s <file.zst> <out_path> [dictionary]\n", argv[0]);
+    if (argc < 3) {
+        fprintf(stderr, "usage: %s <file.zst> <out_path> [dictionary]\n",
+                argv[0]);
 
-    return 1;
-  }
+        return 1;
+    }
 
-  size_t input_size = read_file(argv[1], &input);
-  size_t dict_size = 0;
-  if (argc >= 4) {
-    dict_size = read_file(argv[3], &dict);
-  }
+    size_t input_size = read_file(argv[1], &input);
+    size_t dict_size = 0;
+    if (argc >= 4) {
+        dict_size = read_file(argv[3], &dict);
+    }
 
-  output = malloc(MAX_COMPRESSION_RATIO * input_size);
-  if (!output) {
-    fprintf(stderr, "failed to allocate memory\n");
-    return 1;
-  }
+    size_t decompressed_size = ZSTD_get_decompressed_size(input, input_size);
+    if (decompressed_size == (size_t)-1) { // size not stored in the frame(s)
+        decompressed_size = MAX_COMPRESSION_RATIO * input_size;
+        fprintf(stderr, "WARNING: Compressed data does not contain decompressed"
+                        " size, going to assume the compression ratio is at "
+                        "most %d (decompressed size of at most %zu)\n",
+                MAX_COMPRESSION_RATIO, decompressed_size);
+    }
+    output = malloc(decompressed_size);
+    if (!output) {
+        fprintf(stderr, "failed to allocate memory\n");
+        return 1;
+    }
 
-  size_t decompressed =
-      ZSTD_decompress_with_dict(output, input_size * MAX_COMPRESSION_RATIO,
-                                input, input_size, dict, dict_size);
+    size_t decompressed = // dst_len must be the size actually allocated above
+        ZSTD_decompress_with_dict(output, decompressed_size, input, input_size,
+                                  dict, dict_size);
 
-  write_file(argv[2], output, decompressed);
+    write_file(argv[2], output, decompressed);
 
-  free(input);
-  free(output);
-  free(dict);
-  input = output = dict = NULL;
+    free(input);
+    free(output);
+    free(dict);
+    input = output = dict = NULL;
 }
 
diff --git a/contrib/educational_decoder/zstd_decompress.c b/contrib/educational_decoder/zstd_decompress.c
index 8dc15900..7b04c4b2 100644
--- a/contrib/educational_decoder/zstd_decompress.c
+++ b/contrib/educational_decoder/zstd_decompress.c
@@ -16,6 +16,10 @@ size_t ZSTD_decompress_with_dict(void *dst, size_t dst_len, const void *src,
                                  size_t src_len, const void *dict,
                                  size_t dict_len);
 
+/// Get the decompressed size of an input stream so memory can be allocated in
+/// advance
+size_t ZSTD_get_decompressed_size(const void *src, size_t src_len);
+
 /******* UTILITY MACROS AND TYPES *********************************************/
 #define MAX_WINDOW_SIZE ((size_t)512 << 20)
 // Max block size decompressed size is 128 KB and literal blocks must be smaller
@@ -232,10 +236,30 @@ typedef struct {
     size_t src_len;
 } io_streams_t;
 
+/// A small structure that can be reused in various places that need to access
+/// frame header information; it is filled in by parse_frame_header
+typedef struct {
+    // The size of window that we need to be able to contiguously store for
+    // references; left unset by the parser when single_segment_flag is set
+    size_t window_size;
+    // The total output size of this compressed frame, 0 if not in the header
+    size_t frame_content_size;
+
+    // The dictionary id if this frame uses one, 0 otherwise
+    u32 dictionary_id;
+
+    // Whether or not the content of this frame has a checksum
+    int content_checksum_flag;
+    // Whether or not the output for this frame is in a single segment
+    int single_segment_flag;
+
+    // The size in bytes of this header, counted from the descriptor byte
+    int header_size;
+} frame_header_t;
+
 /// The context needed to decode blocks in a frame
 typedef struct {
-    size_t window_size;
-    size_t frame_content_size;
+    frame_header_t header;
 
     // The total amount of data available for backreferences, to determine if an
     // offset too large to be correct
@@ -255,12 +279,6 @@ typedef struct {
     // The last 3 offsets for the special "repeat offsets".  Array size is 4 so
     // that previous_offsets[1] corresponds to the most recent offset
     u64 previous_offsets[4];
-
-    // The dictionary id for this frame if one exists
-    u32 dictionary_id;
-
-    int single_segment_flag;
-    int content_checksum_flag;
 } frame_context_t;
 
 /// The decoded contents of a dictionary so that it doesn't have to be repeated
@@ -364,10 +382,11 @@ size_t ZSTD_decompress_with_dict(void *dst, size_t dst_len, const void *src,
 /******* FRAME DECODING ******************************************************/
 
 static void decode_data_frame(io_streams_t *streams, dictionary_t *dict);
-static void init_frame_context(frame_context_t *context);
-static void free_frame_context(frame_context_t *context);
-static void parse_frame_header(io_streams_t *streams, frame_context_t *ctx,
+static void init_frame_context(io_streams_t *streams, frame_context_t *context,
                                dictionary_t *dict);
+static void free_frame_context(frame_context_t *context);
+static void parse_frame_header(frame_header_t *header, const u8 *src,
+                               size_t src_len);
 static void frame_context_apply_dict(frame_context_t *ctx, dictionary_t *dict);
 
 static void decompress_data(io_streams_t *streams, frame_context_t *ctx);
@@ -411,12 +430,10 @@ static void decode_data_frame(io_streams_t *streams, dictionary_t *dict) {
     frame_context_t ctx;
 
     // Initialize the context that needs to be carried from block to block
-    init_frame_context(&ctx);
-    parse_frame_header(streams, &ctx, dict);
-    frame_context_apply_dict(&ctx, dict);
+    init_frame_context(streams, &ctx, dict);
 
-    if (ctx.frame_content_size != 0 &&
-        ctx.frame_content_size > streams->dst_len) {
+    if (ctx.header.frame_content_size != 0 &&
+        ctx.header.frame_content_size > streams->dst_len) {
         OUT_SIZE();
     }
 
@@ -425,13 +442,40 @@ static void decode_data_frame(io_streams_t *streams, dictionary_t *dict) {
     free_frame_context(&ctx);
 }
 
-static void init_frame_context(frame_context_t *context) {
+/// Parses the frame header at streams->src and advances past it, allocates the
+/// window buffer, then applies the dictionary (if any) to the new context
+static void init_frame_context(io_streams_t *streams, frame_context_t *context,
+                               dictionary_t *dict) {
     memset(context, 0x00, sizeof(frame_context_t));
 
+    // Parse data from the frame header
+    parse_frame_header(&context->header, streams->src, streams->src_len);
+    streams->src += context->header.header_size;
+    streams->src_len -= context->header.header_size;
+
     // Set up the offset history for the repeat offset commands
     context->previous_offsets[1] = 1;
     context->previous_offsets[2] = 4;
     context->previous_offsets[3] = 8;
+
+    {
+        // Allocate the window buffer
+        size_t buffer_size;
+        if (context->header.single_segment_flag) {
+            buffer_size = context->header.frame_content_size +
+                          (dict ? dict->content_size : 0);
+        } else {
+            buffer_size = context->header.window_size;
+        }
+
+        if (buffer_size > MAX_WINDOW_SIZE) {
+            ERROR("Requested window size too large");
+        }
+        cbuf_init(&context->window, buffer_size);
+    }
+
+    // Apply details from the dict if it exists
+    frame_context_apply_dict(context, dict);
 }
 
 static void free_frame_context(frame_context_t *context) {
@@ -446,13 +490,13 @@ static void free_frame_context(frame_context_t *context) {
     memset(context, 0, sizeof(frame_context_t));
 }
 
-static void parse_frame_header(io_streams_t *streams, frame_context_t *ctx,
-                               dictionary_t *dict) {
-    if (streams->src_len < 1) {
+static void parse_frame_header(frame_header_t *header, const u8 *src,
+                               size_t src_len) {
+    if (src_len < 1) {
         INP_SIZE();
     }
 
-    u8 descriptor = read_bits_LE(streams->src, 8, 0);
+    u8 descriptor = read_bits_LE(src, 8, 0);
 
     // decode frame header descriptor into flags
     u8 frame_content_size_flag = descriptor >> 6;
@@ -465,30 +509,28 @@ static void parse_frame_header(io_streams_t *streams, frame_context_t *ctx,
         CORRUPTION();
     }
 
-    streams->src++;
-    streams->src_len--;
+    int header_size = 1;
 
-    ctx->single_segment_flag = single_segment_flag;
-    ctx->content_checksum_flag = content_checksum_flag;
+    header->single_segment_flag = single_segment_flag;
+    header->content_checksum_flag = content_checksum_flag;
 
     // decode window size
     if (!single_segment_flag) {
-        if (streams->src_len < 1) {
+        if (src_len < header_size + 1) {
             INP_SIZE();
         }
 
         // Use the algorithm from the specification to compute window size
         // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor
-        u8 window_descriptor = read_bits_LE(streams->src, 8, 0);
+        u8 window_descriptor = src[header_size];
         u8 exponent = window_descriptor >> 3;
         u8 mantissa = window_descriptor & 7;
 
         size_t window_base = (size_t)1 << (10 + exponent);
         size_t window_add = (window_base / 8) * mantissa;
-        ctx->window_size = window_base + window_add;
+        header->window_size = window_base + window_add;
 
-        streams->src++;
-        streams->src_len--;
+        header_size++;
     }
 
     // decode dictionary id if it exists
@@ -496,52 +538,40 @@ static void parse_frame_header(io_streams_t *streams, frame_context_t *ctx,
         const int bytes_array[] = {0, 1, 2, 4};
         const int bytes = bytes_array[dictionary_id_flag];
 
-        if (streams->src_len < bytes) {
+        if (src_len < header_size + bytes) {
             INP_SIZE();
         }
 
-        ctx->dictionary_id = read_bits_LE(streams->src, bytes * 8, 0);
-        streams->src += bytes;
-        streams->src_len -= bytes;
+        header->dictionary_id = read_bits_LE(src + header_size, bytes * 8, 0);
+
+        header_size += bytes;
     } else {
-        ctx->dictionary_id = 0;
+        header->dictionary_id = 0;
     }
 
     // decode frame content size if it exists
     if (single_segment_flag || frame_content_size_flag) {
         // if frame_content_size_flag == 0 but single_segment_flag is set, we
-        // still
-        // have a 1 byte field
+        // still have a 1 byte field
         const int bytes_array[] = {1, 2, 4, 8};
         const int bytes = bytes_array[frame_content_size_flag];
 
-        if (streams->src_len < bytes) {
+        if (src_len < header_size + bytes) {
             INP_SIZE();
         }
 
-        ctx->frame_content_size = read_bits_LE(streams->src, bytes * 8, 0);
+        header->frame_content_size =
+            read_bits_LE(src + header_size, bytes * 8, 0);
         if (bytes == 2) {
-            ctx->frame_content_size += 256;
+            header->frame_content_size += 256;
         }
 
-        streams->src += bytes;
-        streams->src_len -= bytes;
+        header_size += bytes;
+    } else {
+        header->frame_content_size = 0;
     }
 
-    if (single_segment_flag) {
-        ctx->window_size =
-            ctx->frame_content_size + (dict ? dict->content_size : 0);
-        // We need to allocate a buffer to write to of size at least output +
-        // dict
-        // size
-        size_t size = ctx->frame_content_size + (dict ? dict->content_size : 0);
-    }
-
-    // Allocate the window
-    if (ctx->window_size > MAX_WINDOW_SIZE) {
-        ERROR("Requested window size too large");
-    }
-    cbuf_init(&ctx->window, ctx->window_size);
+    header->header_size = header_size;
 }
 
 /// A dictionary acts as initializing values for the frame context before
@@ -552,7 +582,7 @@ static void frame_context_apply_dict(frame_context_t *ctx, dictionary_t *dict) {
     if (!dict || !dict->content)
         return;
 
-    if (ctx->dictionary_id == 0 && dict->dictionary_id != 0) {
+    if (ctx->header.dictionary_id == 0 && dict->dictionary_id != 0) {
         // The dictionary is unneeded, and shouldn't be used as it may interfere
         // with the default offset history
         return;
@@ -560,7 +590,8 @@ static void frame_context_apply_dict(frame_context_t *ctx, dictionary_t *dict) {
 
     // If the dictionary id is 0, it doesn't matter if we provide the wrong raw
     // content dict, it won't change anything
-    if (ctx->dictionary_id != 0 && ctx->dictionary_id != dict->dictionary_id) {
+    if (ctx->header.dictionary_id != 0 &&
+        ctx->header.dictionary_id != dict->dictionary_id) {
         ERROR("Wrong/no dictionary provided");
     }
 
@@ -575,8 +606,7 @@ static void frame_context_apply_dict(frame_context_t *ctx, dictionary_t *dict) {
     // be used in the table repeat modes
     if (dict->dictionary_id != 0) {
         // Deep copy the entropy tables so they can be freed independently of
-        // the
-        // dictionary struct
+        // the dictionary struct
         HUF_copy_dtable(&ctx->literals_dtable, &dict->literals_dtable);
         FSE_copy_dtable(&ctx->ll_dtable, &dict->ll_dtable);
         FSE_copy_dtable(&ctx->of_dtable, &dict->of_dtable);
@@ -590,14 +620,14 @@ static void frame_context_apply_dict(frame_context_t *ctx, dictionary_t *dict) {
 /// Decompress the data from a frame block by block
 static void decompress_data(io_streams_t *streams, frame_context_t *ctx) {
 
-    u8 last_block = 0;
+    int last_block = 0;
     do {
         if (streams->src_len < 3) {
             INP_SIZE();
         }
         // Parse the block header
         last_block = streams->src[0] & 1;
-        u8 block_type = (streams->src[0] >> 1) & 3;
+        int block_type = (streams->src[0] >> 1) & 3;
         size_t block_len = read_bits_LE(streams->src, 21, 3);
 
         streams->src += 3;
@@ -648,6 +678,10 @@ static void decompress_data(io_streams_t *streams, frame_context_t *ctx) {
             // Compressed block, this is mode complex
             decompress_block(streams, ctx, block_len);
             break;
+        case 3:
+            // Reserved block type
+            CORRUPTION();
+            break;
         }
     } while (!last_block);
 
@@ -656,10 +690,9 @@ static void decompress_data(io_streams_t *streams, frame_context_t *ctx) {
     streams->dst += written;
     streams->dst_len -= written;
 
-    if (ctx->content_checksum_flag) {
+    if (ctx->header.content_checksum_flag) {
         // This program does not support checking the checksum, so skip over it
-        // if
-        // it's present
+        // if it's present
         if (streams->src_len < 4) {
             INP_SIZE();
         }
@@ -1312,6 +1345,126 @@ static size_t execute_sequences(io_streams_t *streams, frame_context_t *ctx,
 }
 /******* END SEQUENCE EXECUTION ***********************************************/
 
+/******* OUTPUT SIZE COUNTING *************************************************/
+size_t traverse_frame(frame_header_t *header, const u8 *src, size_t src_len);
+
+/// Get the decompressed size of an input stream so memory can be allocated in
+/// advance.  `src`/`src_len` describe one or more concatenated frames.
+/// Returns the sum of the content sizes declared by the Zstandard frames
+/// (skippable frames add nothing), or `(size_t)-1` if a frame omits its size.
+/// This is more complex than the reference implementation, as this API allows
+/// for the decompression of multiple concatenated frames.
+size_t ZSTD_get_decompressed_size(const void *src, size_t src_len) {
+    const u8 *ip = (const u8 *) src;
+    size_t dst_size = 0;
+
+    // Each frame header only gives us the size of its frame, so iterate over
+    // all frames
+    while (src_len > 0) {
+        if (src_len < 4) {
+            INP_SIZE();
+        }
+        u32 magic_number = read_bits_LE(ip, 32, 0);
+        ip += 4;
+        src_len -= 4;
+        if (magic_number >= 0x184D2A50U && magic_number <= 0x184D2A5F) {
+            // skippable frame, this has no impact on output size
+            if (src_len < 4) {
+                INP_SIZE();
+            }
+            // `ip` is already past the magic number, so the 4 byte frame
+            // size field starts at bit offset 0
+            size_t frame_size = read_bits_LE(ip, 32, 0);
+            // Compare against the bytes left after the size field; this
+            // cannot wrap the way `4 + frame_size` can with a 32 bit size_t
+            if (frame_size > src_len - 4) {
+                INP_SIZE();
+            }
+            // skip over frame
+            ip += 4 + frame_size;
+            src_len -= 4 + frame_size;
+        } else if (magic_number == 0xFD2FB528U) {
+            // ZSTD frame
+            frame_header_t header;
+            parse_frame_header(&header, ip, src_len);
+
+            if (header.frame_content_size == 0 && !header.single_segment_flag) {
+                // Content size not provided, we can't tell
+                return (size_t)-1;
+            }
+            dst_size += header.frame_content_size;
+
+            // we need to traverse the frame to find when the next one starts
+            size_t traversed = traverse_frame(&header, ip, src_len);
+            ip += traversed;
+            src_len -= traversed;
+        } else {
+            // not a real frame
+            ERROR("Invalid magic number");
+        }
+    }
+
+    return dst_size;
+}
+
+/// Iterate over each block in a frame to find the end of it, to get to the
+/// start of the next frame.
+/// `src` must be the pointer that was given to parse_frame_header() to fill
+/// in `header`, with `src_len` bytes readable from it.
+/// Returns the number of bytes from `src` to the end of the frame, counting
+/// the frame header and, when flagged in the header, the 4 checksum bytes.
+size_t traverse_frame(frame_header_t *header, const u8 *src, size_t src_len) {
+    const u8 *const src_beg = src;
+    const u8 *const src_end = src + src_len;
+    // Bounds checks compare lengths (`src_end - src`) rather than forming
+    // `src + n`, which is undefined once it points past the end of the input
+    if (header->header_size > src_len) {
+        INP_SIZE();
+    }
+    src += header->header_size;
+
+    int last_block = 0;
+    do {
+        if (src_end - src < 3) {
+            INP_SIZE();
+        }
+        // Parse the block header
+        last_block = src[0] & 1;
+        int block_type = (src[0] >> 1) & 3;
+        size_t block_len = read_bits_LE(src, 21, 3);
+        src += 3;
+        size_t body_len = 0; // input bytes occupied by the block body
+        switch (block_type) {
+        case 0: // Raw block, block_len bytes
+        case 2: // Compressed block, compressed size is block_len
+            body_len = block_len;
+            break;
+        case 1: // RLE block, 1 byte
+            body_len = 1;
+            break;
+        case 3:
+            // Reserved block type
+            CORRUPTION();
+            break;
+        }
+        if (body_len > (size_t)(src_end - src)) {
+            INP_SIZE();
+        }
+        src += body_len;
+    } while (!last_block);
+
+    if (header->content_checksum_flag) {
+        if (src_end - src < 4) {
+            INP_SIZE();
+        }
+        src += 4;
+    }
+
+    return (size_t)(src - src_beg);
+}
+
+/******* END OUTPUT SIZE COUNTING *********************************************/
+
 /******* DICTIONARY PARSING ***************************************************/
 static void init_raw_content_dict(dictionary_t *dict, const u8 *src,
                                   size_t src_len);
@@ -1952,8 +2105,8 @@ static void FSE_init_dtable(FSE_dtable *dtable, const i16 *norm_freqs,
                      high_threshold); // Make sure we don't occupy a spot taken
                                       // by the low prob symbols
             // Note: no other collision checking is necessary as `step` is
-            // coprime to
-            // `size`, so the cycle will visit each position exactly once
+            // coprime to `size`, so the cycle will visit each position exactly
+            // once
         }
     }
     if (pos != 0) {
@@ -1964,13 +2117,11 @@ static void FSE_init_dtable(FSE_dtable *dtable, const i16 *norm_freqs,
     for (int i = 0; i < size; i++) {
         u8 symbol = dtable->symbols[i];
         u16 next_state_desc = state_desc[symbol]++;
-        // Fills in the table appropriately
-        // next_state_desc increases by symbol over time, decreasing number of
-        // bits
+        // Fills in the table appropriately. next_state_desc increases by symbol
+        // over time, decreasing number of bits
         dtable->num_bits[i] = (u8)(accuracy_log - log2inf(next_state_desc));
         // baseline increases until the bit threshold is passed, at which point
-        // it
-        // resets to 0
+        // it resets to 0
         dtable->new_state_base[i] =
             ((u16)next_state_desc << dtable->num_bits[i]) - size;
     }
@@ -2057,8 +2208,7 @@ static void FSE_init_dtable_rle(FSE_dtable *dtable, u8 symb) {
     dtable->new_state_base = malloc(sizeof(u16));
 
     // This setup will always have a state of 0, always return symbol `symb`,
-    // and
-    // never consume any bits
+    // and never consume any bits
     dtable->symbols[0] = symb;
     dtable->num_bits[0] = 0;
     dtable->new_state_base[0] = 0;
diff --git a/contrib/educational_decoder/zstd_decompress.h b/contrib/educational_decoder/zstd_decompress.h
index 3671678b..3e1bc568 100644
--- a/contrib/educational_decoder/zstd_decompress.h
+++ b/contrib/educational_decoder/zstd_decompress.h
@@ -3,4 +3,5 @@ size_t ZSTD_decompress(void *dst, size_t dst_len, const void *src,
 size_t ZSTD_decompress_with_dict(void *dst, size_t dst_len, const void *src,
                                  size_t src_len, const void *dict,
                                  size_t dict_len);
+size_t ZSTD_get_decompressed_size(const void *src, size_t src_len);
 

From f231626244f857bb18b3459f8d6ac143c6f65da6 Mon Sep 17 00:00:00 2001
From: Sean Purcell <me@seanp.xyz>
Date: Mon, 30 Jan 2017 14:57:02 -0800
Subject: [PATCH 03/15] Minor fixes according to comments

- Add Facebook copyright notice
- Make max size macros more consistent
- Fix some unchecked malloc's
---
 contrib/educational_decoder/README.md         |  7 +++---
 contrib/educational_decoder/harness.c         |  9 +++++++
 contrib/educational_decoder/zstd_decompress.c | 24 +++++++++++++++++--
 contrib/educational_decoder/zstd_decompress.h |  9 +++++++
 4 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/contrib/educational_decoder/README.md b/contrib/educational_decoder/README.md
index a1f703f6..2e2186e0 100644
--- a/contrib/educational_decoder/README.md
+++ b/contrib/educational_decoder/README.md
@@ -1,15 +1,16 @@
 Educational Decoder
 ===================
 
-`zstd_decompress.c` is a self-contained implementation of a decoder according
-to the Zstandard format specification written in C99.
+`zstd_decompress.c` is a self-contained implementation in C99 of a decoder,
+according to the [Zstandard format specification].
 While it does not implement as many features as the reference decoder,
 such as the streaming API or content checksums, it is written to be easy to
 follow and understand, to help understand how the Zstandard format works.
 It's laid out to match the [format specification],
-so it can be used to understand how confusing segments could be implemented.
+so it can be used to understand how complex segments could be implemented.
 It also contains implementations of Huffman and FSE table decoding.
 
+[Zstandard format specification]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md
 [format specification]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md
 
 `harness.c` provides a simple test harness around the decoder:
diff --git a/contrib/educational_decoder/harness.c b/contrib/educational_decoder/harness.c
index c44100ff..42424d4b 100644
--- a/contrib/educational_decoder/harness.c
+++ b/contrib/educational_decoder/harness.c
@@ -1,3 +1,12 @@
+/*
+ * Copyright (c) 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
 #include <stdio.h>
 #include <stdlib.h>
 
diff --git a/contrib/educational_decoder/zstd_decompress.c b/contrib/educational_decoder/zstd_decompress.c
index 7b04c4b2..79fd2685 100644
--- a/contrib/educational_decoder/zstd_decompress.c
+++ b/contrib/educational_decoder/zstd_decompress.c
@@ -1,3 +1,12 @@
+/*
+ * Copyright (c) 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
 /// Zstandard educational decoder implementation
 /// See https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md
 
@@ -21,10 +30,13 @@ size_t ZSTD_decompress_with_dict(void *dst, size_t dst_len, const void *src,
 size_t ZSTD_get_decompressed_size(const void *src, size_t src_len);
 
 /******* UTILITY MACROS AND TYPES *********************************************/
-#define MAX_WINDOW_SIZE ((size_t)512 << 20)
+// Specification recommends supporting at least 8MB.  The maximum possible value
+// is 1.875TB, but this implementation limits it to 512MB to avoid allocating
+// too much memory.
+#define MAX_WINDOW_SIZE ((size_t)512 * 1024 * 1024)
 // Max block size decompressed size is 128 KB and literal blocks must be smaller
 // than that
-#define MAX_LITERALS_SIZE ((size_t)(1024 * 128))
+#define MAX_LITERALS_SIZE ((size_t)128 * 1024)
 
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
@@ -2071,6 +2083,10 @@ static void FSE_init_dtable(FSE_dtable *dtable, const i16 *norm_freqs,
     dtable->num_bits = malloc(size * sizeof(u8));
     dtable->new_state_base = malloc(size * sizeof(u16));
 
+    if (!dtable->symbols || !dtable->num_bits || !dtable->new_state_base) {
+        BAD_ALLOC();
+    }
+
     // Used to determine how many bits need to be read for each state,
     // and where the destination range should start
     // Needs to be u16 because max value is 2 * max number of symbols,
@@ -2207,6 +2223,10 @@ static void FSE_init_dtable_rle(FSE_dtable *dtable, u8 symb) {
     dtable->num_bits = malloc(sizeof(u8));
     dtable->new_state_base = malloc(sizeof(u16));
 
+    if (!dtable->symbols || !dtable->num_bits || !dtable->new_state_base) {
+        BAD_ALLOC();
+    }
+
     // This setup will always have a state of 0, always return symbol `symb`,
     // and never consume any bits
     dtable->symbols[0] = symb;
diff --git a/contrib/educational_decoder/zstd_decompress.h b/contrib/educational_decoder/zstd_decompress.h
index 3e1bc568..6e173672 100644
--- a/contrib/educational_decoder/zstd_decompress.h
+++ b/contrib/educational_decoder/zstd_decompress.h
@@ -1,3 +1,12 @@
+/*
+ * Copyright (c) 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
 size_t ZSTD_decompress(void *dst, size_t dst_len, const void *src,
                        size_t src_len);
 size_t ZSTD_decompress_with_dict(void *dst, size_t dst_len, const void *src,

From f5d2f32d4dd34a88dcf68cc8929e3063d591654b Mon Sep 17 00:00:00 2001
From: Sean Purcell <me@seanp.xyz>
Date: Tue, 31 Jan 2017 15:54:02 -0800
Subject: [PATCH 04/15] Removed circular buffer, matches access destination
 buffer directly

---
 contrib/educational_decoder/zstd_decompress.c | 196 +++++++-----------
 1 file changed, 75 insertions(+), 121 deletions(-)

diff --git a/contrib/educational_decoder/zstd_decompress.c b/contrib/educational_decoder/zstd_decompress.c
index 79fd2685..90d4a522 100644
--- a/contrib/educational_decoder/zstd_decompress.c
+++ b/contrib/educational_decoder/zstd_decompress.c
@@ -30,10 +30,6 @@ size_t ZSTD_decompress_with_dict(void *dst, size_t dst_len, const void *src,
 size_t ZSTD_get_decompressed_size(const void *src, size_t src_len);
 
 /******* UTILITY MACROS AND TYPES *********************************************/
-// Specification recommends supporting at least 8MB.  The maximum possible value
-// is 1.875TB, but this implementation limits it to 512MB to avoid allocating
-// too much memory.
-#define MAX_WINDOW_SIZE ((size_t)512 * 1024 * 1024)
 // Max block size decompressed size is 128 KB and literal blocks must be smaller
 // than that
 #define MAX_LITERALS_SIZE ((size_t)128 * 1024)
@@ -69,43 +65,6 @@ typedef int64_t i64;
 /// file.  They implement low-level functionality needed for the higher level
 /// decompression functions.
 
-/*** CIRCULAR BUFFER ******************/
-/// A standard circular buffer, used to facilitate back reference commands
-typedef struct {
-    u8 *ptr;
-    size_t idx, last_flush, size;
-} cbuf_t;
-
-/// Initialize a circular buffer
-static void cbuf_init(cbuf_t *buf, size_t size);
-static void cbuf_free(cbuf_t *buf);
-
-/// Copies up to `src_len` bytes from `src` into the buffer, stopping if it
-/// would need to flush.
-/// Returns the total amount of data copied.
-static size_t cbuf_write_data(cbuf_t *buf, const u8 *src, size_t src_len);
-/// Copies `len` bytes from `offset` back in the buffer, stopping if it would
-/// need to flush.
-/// Returns the number of bytes copied.
-static size_t cbuf_copy_offset(cbuf_t *buf, size_t offset, size_t len);
-/// Writes up to `len` copies of `byte`, stopping if would need to flush.
-/// Returns the number of bytes copied.
-static size_t cbuf_repeat_byte(cbuf_t *buf, u8 byte, size_t len);
-
-/// The `full` versions of the above functions write the full amount requested,
-/// flushing to `out` when necessary.
-/// They return the number of bytes flushed to `out`, if any.
-static size_t cbuf_write_data_full(cbuf_t *buf, const u8 *src, size_t src_len,
-                                   u8 *out, size_t out_len);
-static size_t cbuf_copy_offset_full(cbuf_t *buf, size_t offset, size_t len,
-                                    u8 *out, size_t out_len);
-static size_t cbuf_repeat_byte_full(cbuf_t *buf, u8 byte, size_t len, u8 *out,
-                                    size_t out_len);
-
-/// Flushes any unflushed data to `dst`
-static size_t cbuf_flush(cbuf_t *buf, u8 *dst, size_t dst_len);
-/*** END CIRCULAR BUFFER **************/
-
 /*** BITSTREAM OPERATIONS *************/
 /// Read `num` bits (up to 64) from `src + offset`, where `offset` is in bits
 static inline u64 read_bits_LE(const u8 *src, int num, size_t offset);
@@ -277,12 +236,11 @@ typedef struct {
     // offset too large to be correct
     size_t current_total_output;
 
-    // A sliding window of the past `window_size` bytes decoded
-    cbuf_t window;
+    const u8 *dict_content;
+    size_t dict_content_len;
 
     // Entropy encoding tables so they can be repeated by future blocks instead
-    // of
-    // retransmitting
+    // of retransmitting
     HUF_dtable literals_dtable;
     FSE_dtable ll_dtable;
     FSE_dtable ml_dtable;
@@ -470,22 +428,6 @@ static void init_frame_context(io_streams_t *streams, frame_context_t *context,
     context->previous_offsets[2] = 4;
     context->previous_offsets[3] = 8;
 
-    {
-        // Allocate the window buffer
-        size_t buffer_size;
-        if (context->header.single_segment_flag) {
-            buffer_size = context->header.frame_content_size +
-                          (dict ? dict->content_size : 0);
-        } else {
-            buffer_size = context->header.window_size;
-        }
-
-        if (buffer_size > MAX_WINDOW_SIZE) {
-            ERROR("Requested window size too large");
-        }
-        cbuf_init(&context->window, buffer_size);
-    }
-
     // Apply details from the dict if it exists
     frame_context_apply_dict(context, dict);
 }
@@ -497,8 +439,6 @@ static void free_frame_context(frame_context_t *context) {
     FSE_free_dtable(&context->ml_dtable);
     FSE_free_dtable(&context->of_dtable);
 
-    cbuf_free(&context->window);
-
     memset(context, 0, sizeof(frame_context_t));
 }
 
@@ -583,6 +523,13 @@ static void parse_frame_header(frame_header_t *header, const u8 *src,
         header->frame_content_size = 0;
     }
 
+    if (single_segment_flag) {
+        // in this case the effective window size is frame_content_size; this
+        // impacts sequence decoding as we need to determine whether to fall
+        // back to the dictionary or not on large offsets
+        header->window_size = header->frame_content_size;
+    }
+
     header->header_size = header_size;
 }
 
@@ -607,12 +554,9 @@ static void frame_context_apply_dict(frame_context_t *ctx, dictionary_t *dict) {
         ERROR("Wrong/no dictionary provided");
     }
 
-    // Write the dict data in, and then flush to NULL so it's not sent to the
-    // output stream
-    cbuf_write_data_full(&ctx->window, dict->content, dict->content_size, NULL,
-                         -1);
-    cbuf_flush(&ctx->window, NULL, -1);
-    ctx->current_total_output = dict->content_size;
+    // Copy the pointer in so we can reference it in sequence execution
+    ctx->dict_content = dict->content;
+    ctx->dict_content_len = dict->content_size;
 
     // If it's a formatted dict copy the precomputed tables in so they can
     // be used in the table repeat modes
@@ -655,15 +599,16 @@ static void decompress_data(io_streams_t *streams, frame_context_t *ctx) {
                 OUT_SIZE();
             }
 
-            // Write the raw data into the window buffer
-            size_t written =
-                cbuf_write_data_full(&ctx->window, streams->src, block_len,
-                                     streams->dst, streams->dst_len);
+            // Copy the raw data into the output
+            memcpy(streams->dst, streams->src, block_len);
+
             streams->src += block_len;
             streams->src_len -= block_len;
 
-            streams->dst += written;
-            streams->dst_len -= written;
+            streams->dst += block_len;
+            streams->dst_len -= block_len;
+
+            ctx->current_total_output += block_len;
             break;
         }
         case 1: {
@@ -675,15 +620,16 @@ static void decompress_data(io_streams_t *streams, frame_context_t *ctx) {
                 OUT_SIZE();
             }
 
-            // Write streams->src[0] into the buffer block_len times
-            size_t written =
-                cbuf_repeat_byte_full(&ctx->window, streams->src[0], block_len,
-                                      streams->dst, streams->dst_len);
-            streams->dst += written;
-            streams->dst_len -= written;
+            // Copy `block_len` copies of `streams->src[0]` to the output
+            memset(streams->dst, streams->src[0], block_len);
+
+            streams->dst += block_len;
+            streams->dst_len -= block_len;
 
             streams->src += 1;
             streams->src_len -= 1;
+
+            ctx->current_total_output += block_len;
             break;
         }
         case 2:
@@ -697,11 +643,6 @@ static void decompress_data(io_streams_t *streams, frame_context_t *ctx) {
         }
     } while (!last_block);
 
-    // Flush out anything left in the window buffer to the destination stream
-    size_t written = cbuf_flush(&ctx->window, streams->dst, streams->dst_len);
-    streams->dst += written;
-    streams->dst_len -= written;
-
     if (ctx->header.content_checksum_flag) {
         // This program does not support checking the checksum, so skip over it
         // if it's present
@@ -1277,20 +1218,19 @@ static size_t execute_sequences(io_streams_t *streams, frame_context_t *ctx,
             CORRUPTION();
         }
 
-        {
-            // Copy literals to the buffer
-            size_t written =
-                cbuf_write_data_full(&ctx->window, literals, seq.literal_length,
-                                     streams->dst, streams->dst_len);
-
-            literals += seq.literal_length;
-            literals_len -= seq.literal_length;
-
-            streams->dst += written;
-            streams->dst_len -= written;
-
-            total_output += seq.literal_length;
+        if (streams->dst_len < seq.literal_length + seq.match_length) {
+            OUT_SIZE();
         }
+        // Copy literals to output
+        memcpy(streams->dst, literals, seq.literal_length);
+
+        literals += seq.literal_length;
+        literals_len -= seq.literal_length;
+
+        streams->dst += seq.literal_length;
+        streams->dst_len -= seq.literal_length;
+
+        total_output += seq.literal_length;
 
         size_t offset;
 
@@ -1324,36 +1264,50 @@ static size_t execute_sequences(io_streams_t *streams, frame_context_t *ctx,
             offset_hist[1] = offset;
         }
 
-        if (offset > total_output) {
-            CORRUPTION();
+        size_t match_length = seq.match_length;
+        if (total_output <= ctx->header.window_size) {
+            // In this case offset might go back into the dictionary
+            if (offset > total_output + ctx->dict_content_len) {
+                // The offset goes beyond even the dictionary
+                CORRUPTION();
+            }
+
+            if (offset > total_output) {
+                const size_t dict_copy =
+                    MIN(offset - total_output, match_length);
+                const size_t dict_offset =
+                    ctx->dict_content_len - (offset - total_output);
+                for (size_t i = 0; i < dict_copy; i++) {
+                    *streams->dst++ = ctx->dict_content[dict_offset + i];
+                }
+                match_length -= dict_copy;
+            }
         }
 
-        {
-            // Do the offset copy operation
-            size_t written =
-                cbuf_copy_offset_full(&ctx->window, offset, seq.match_length,
-                                      streams->dst, streams->dst_len);
-
-            streams->dst += written;
-            streams->dst_len -= written;
-            total_output += seq.match_length;
+        // We must copy byte by byte because the match length might be larger
+        // than the offset
+        // ex: if the output so far was "abc", a command with offset=3 and
+        // match_length=6 would produce "abcabcabc" as the new output
+        for (size_t i = 0; i < match_length; i++) {
+            *streams->dst = *(streams->dst - offset);
+            streams->dst++;
         }
+
+        streams->dst_len -= seq.match_length;
+        total_output += seq.match_length;
     }
 
-    {
-        // Copy any leftover literal bytes
-        size_t written =
-            cbuf_write_data_full(&ctx->window, literals, literals_len,
-                                 streams->dst, streams->dst_len);
-        streams->dst += written;
-        streams->dst_len -= written;
-
-        total_output += literals_len;
+    if (streams->dst_len < literals_len) {
+        OUT_SIZE();
     }
+    // Copy any leftover literals
+    memcpy(streams->dst, literals, literals_len);
+    streams->dst += literals_len;
+    streams->dst_len -= literals_len;
+
+    total_output += literals_len;
 
     ctx->current_total_output = total_output;
-
-    return total_output;
 }
 /******* END SEQUENCE EXECUTION ***********************************************/
 

From 92ec2ea62f34870d0f9900af68eb816845a3494c Mon Sep 17 00:00:00 2001
From: Sean Purcell <me@seanp.xyz>
Date: Tue, 31 Jan 2017 15:57:18 -0800
Subject: [PATCH 05/15] More const's and readability improvements

---
 contrib/educational_decoder/harness.c         |   2 +-
 contrib/educational_decoder/zstd_decompress.c | 860 ++++++++----------
 contrib/educational_decoder/zstd_decompress.h |  12 +-
 3 files changed, 410 insertions(+), 464 deletions(-)

diff --git a/contrib/educational_decoder/harness.c b/contrib/educational_decoder/harness.c
index 42424d4b..107a16a2 100644
--- a/contrib/educational_decoder/harness.c
+++ b/contrib/educational_decoder/harness.c
@@ -88,7 +88,7 @@ int main(int argc, char **argv) {
         decompressed_size = MAX_COMPRESSION_RATIO * input_size;
         fprintf(stderr, "WARNING: Compressed data does contain decompressed "
                         "size, going to assume the compression ratio is at "
-                        "most %d (decompressed size of at most %lld\n",
+                        "most %d (decompressed size of at most %zu)\n",
                 MAX_COMPRESSION_RATIO, decompressed_size);
     }
     output = malloc(decompressed_size);
diff --git a/contrib/educational_decoder/zstd_decompress.c b/contrib/educational_decoder/zstd_decompress.c
index 90d4a522..3c1c5673 100644
--- a/contrib/educational_decoder/zstd_decompress.c
+++ b/contrib/educational_decoder/zstd_decompress.c
@@ -17,17 +17,17 @@
 
 /// Zstandard decompression functions.
 /// `dst` must point to a space at least as large as the reconstructed output.
-size_t ZSTD_decompress(void *dst, size_t dst_len, const void *src,
-                       size_t src_len);
+size_t ZSTD_decompress(void *const dst, const size_t dst_len,
+                       const void *const src, const size_t src_len);
 /// If `dict != NULL` and `dict_len >= 8`, does the same thing as
 /// `ZSTD_decompress` but uses the provided dict
-size_t ZSTD_decompress_with_dict(void *dst, size_t dst_len, const void *src,
-                                 size_t src_len, const void *dict,
-                                 size_t dict_len);
+size_t ZSTD_decompress_with_dict(void *const dst, const size_t dst_len,
+                                 const void *const src, const size_t src_len,
+                                 const void *const dict, const size_t dict_len);
 
 /// Get the decompressed size of an input stream so memory can be allocated in
 /// advance
-size_t ZSTD_get_decompressed_size(const void *src, size_t src_len);
+size_t ZSTD_get_decompressed_size(const void *const src, const size_t src_len);
 
 /******* UTILITY MACROS AND TYPES *********************************************/
 // Max block size decompressed size is 128 KB and literal blocks must be smaller
@@ -67,23 +67,21 @@ typedef int64_t i64;
 
 /*** BITSTREAM OPERATIONS *************/
 /// Read `num` bits (up to 64) from `src + offset`, where `offset` is in bits
-static inline u64 read_bits_LE(const u8 *src, int num, size_t offset);
+static inline u64 read_bits_LE(const u8 *src, const int num,
+                               const size_t offset);
 
 /// Read bits from the end of a HUF or FSE bitstream.  `offset` is in bits, so
 /// it updates `offset` to `offset - bits`, and then reads `bits` bits from
 /// `src + offset`.  If the offset becomes negative, the extra bits at the
 /// bottom are filled in with `0` bits instead of reading from before `src`.
-static inline u64 STREAM_read_bits(const u8 *src, int bits, i64 *offset);
+static inline u64 STREAM_read_bits(const u8 *src, const int bits,
+                                   i64 *const offset);
 /*** END BITSTREAM OPERATIONS *********/
 
 /*** BIT COUNTING OPERATIONS **********/
-/// Returns `x`, where `2^x` is the smallest power of 2 greater than or equal to
-/// `num`, or `-1` if `num > 2^63`
-static inline int log2sup(u64 num);
-
 /// Returns `x`, where `2^x` is the largest power of 2 less than or equal to
 /// `num`, or `-1` if `num == 0`.
-static inline int log2inf(u64 num);
+static inline int log2inf(const u64 num);
 /*** END BIT COUNTING OPERATIONS ******/
 
 /*** HUFFMAN PRIMITIVES ***************/
@@ -101,36 +99,41 @@ typedef struct {
 } HUF_dtable;
 
 /// Decode a single symbol and read in enough bits to refresh the state
-static inline u8 HUF_decode_symbol(HUF_dtable *dtable, u16 *state,
-                                   const u8 *src, i64 *offset);
+static inline u8 HUF_decode_symbol(const HUF_dtable *const dtable,
+                                   u16 *const state, const u8 *const src,
+                                   i64 *const offset);
 /// Read in a full state's worth of bits to initialize it
-static inline void HUF_init_state(HUF_dtable *dtable, u16 *state, const u8 *src,
-                                  i64 *offset);
-
-/// Initialize a Huffman decoding table using the table of bit counts provided
-static void HUF_init_dtable(HUF_dtable *table, u8 *bits, int num_symbs);
-/// Initialize a Huffman decoding table using the table of weights provided
-/// Weights follow the definition provided in the Zstandard specification
-static void HUF_init_dtable_usingweights(HUF_dtable *table, u8 *weights,
-                                         int num_symbs);
+static inline void HUF_init_state(const HUF_dtable *const dtable,
+                                  u16 *const state, const u8 *const src,
+                                  i64 *const offset);
 
 /// Decompresses a single Huffman stream, returns the number of bytes decoded.
 /// `src_len` must be the exact length of the Huffman-coded block.
-static size_t HUF_decompress_1stream(HUF_dtable *table, u8 *dst, size_t dst_len,
-                                     const u8 *src, size_t src_len);
+static size_t HUF_decompress_1stream(const HUF_dtable *const dtable, u8 *dst,
+                                     const size_t dst_len, const u8 *src,
+                                     size_t src_len);
 /// Same as previous but decodes 4 streams, formatted as in the Zstandard
 /// specification.
 /// `src_len` must be the exact length of the Huffman-coded block.
-static size_t HUF_decompress_4stream(HUF_dtable *dtable, u8 *dst,
-                                     size_t dst_len, const u8 *src,
-                                     size_t src_len);
+static size_t HUF_decompress_4stream(const HUF_dtable *const dtable, u8 *dst,
+                                     const size_t dst_len, const u8 *const src,
+                                     const size_t src_len);
+
+/// Initialize a Huffman decoding table using the table of bit counts provided
+static void HUF_init_dtable(HUF_dtable *const table, const u8 *const bits,
+                            const int num_symbs);
+/// Initialize a Huffman decoding table using the table of weights provided
+/// Weights follow the definition provided in the Zstandard specification
+static void HUF_init_dtable_usingweights(HUF_dtable *const table,
+                                         const u8 *const weights,
+                                         const int num_symbs);
 
 /// Free the malloc'ed parts of a decoding table
-static void HUF_free_dtable(HUF_dtable *dtable);
+static void HUF_free_dtable(HUF_dtable *const dtable);
 
 /// Deep copy a decoding table, so that it can be used and free'd without
 /// impacting the source table.
-static void HUF_copy_dtable(HUF_dtable *dst, const HUF_dtable *src);
+static void HUF_copy_dtable(HUF_dtable *const dst, const HUF_dtable *const src);
 /*** END HUFFMAN PRIMITIVES ***********/
 
 /*** FSE PRIMITIVES *******************/
@@ -151,46 +154,53 @@ typedef struct {
 } FSE_dtable;
 
 /// Return the symbol for the current state
-static inline u8 FSE_peek_symbol(FSE_dtable *dtable, u16 state);
+static inline u8 FSE_peek_symbol(const FSE_dtable *const dtable,
+                                 const u16 state);
 /// Read the number of bits necessary to update state, update, and shift offset
 /// back to reflect the bits read
-static inline void FSE_update_state(FSE_dtable *dtable, u16 *state,
-                                    const u8 *src, i64 *offset);
+static inline void FSE_update_state(const FSE_dtable *const dtable,
+                                    u16 *const state, const u8 *const src,
+                                    i64 *const offset);
 
 /// Combine peek and update: decode a symbol and update the state
-static inline u8 FSE_decode_symbol(FSE_dtable *dtable, u16 *state,
-                                   const u8 *src, i64 *offset);
+static inline u8 FSE_decode_symbol(const FSE_dtable *const dtable,
+                                   u16 *const state, const u8 *const src,
+                                   i64 *const offset);
 
 /// Read bits from the stream to initialize the state and shift offset back
-static inline void FSE_init_state(FSE_dtable *dtable, u16 *state, const u8 *src,
-                                  i64 *offset);
+static inline void FSE_init_state(const FSE_dtable *const dtable,
+                                  u16 *const state, const u8 *const src,
+                                  i64 *const offset);
 
 /// Decompress two interleaved bitstreams (e.g. compressed Huffman weights)
 /// using an FSE decoding table.  `src_len` must be the exact length of the
 /// block.
-static size_t FSE_decompress_interleaved2(FSE_dtable *dtable, u8 *dst,
-                                          size_t dst_len, const u8 *src,
-                                          size_t src_len);
+static size_t FSE_decompress_interleaved2(const FSE_dtable *const dtable,
+                                          u8 *dst, const size_t dst_len,
+                                          const u8 *const src,
+                                          const size_t src_len);
 
 /// Initialize a decoding table using normalized frequencies.
-static void FSE_init_dtable(FSE_dtable *dtable, const i16 *norm_freqs,
-                            int num_symbs, int accuracy_log);
+static void FSE_init_dtable(FSE_dtable *const dtable,
+                            const i16 *const norm_freqs, const int num_symbs,
+                            const int accuracy_log);
 
 /// Decode an FSE header as defined in the Zstandard format specification and
 /// use the decoded frequencies to initialize a decoding table.
-static size_t FSE_decode_header(FSE_dtable *dtable, const u8 *src,
-                                size_t src_len, int max_accuracy_log);
+static size_t FSE_decode_header(FSE_dtable *const dtable, const u8 *const src,
+                                const size_t src_len,
+                                const int max_accuracy_log);
 
 /// Initialize an FSE table that will always return the same symbol and consume
 /// 0 bits per symbol, to be used for RLE mode in sequence commands
-static void FSE_init_dtable_rle(FSE_dtable *dtable, u8 symb);
+static void FSE_init_dtable_rle(FSE_dtable *const dtable, const u8 symb);
 
 /// Free the malloc'ed parts of a decoding table
-static void FSE_free_dtable(FSE_dtable *dtable);
+static void FSE_free_dtable(FSE_dtable *const dtable);
 
 /// Deep copy a decoding table, so that it can be used and free'd without
 /// impacting the source table.
-static void FSE_copy_dtable(FSE_dtable *dst, const FSE_dtable *src);
+static void FSE_copy_dtable(FSE_dtable *const dst, const FSE_dtable *const src);
 /*** END FSE PRIMITIVES ***************/
 
 /******* END IMPLEMENTATION PRIMITIVE PROTOTYPES ******************************/
@@ -291,47 +301,46 @@ typedef struct {
 /// Accepts a dict argument, which may be NULL indicating no dictionary.
 /// See
 /// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame-concatenation
-static void decode_frame(io_streams_t *streams, dictionary_t *dict);
+static void decode_frame(io_streams_t *const streams,
+                         const dictionary_t *const dict);
 
 // Decode data in a compressed block
-static void decompress_block(io_streams_t *streams, frame_context_t *ctx,
-                             size_t block_len);
+static void decompress_block(io_streams_t *const streams,
+                             frame_context_t *const ctx,
+                             const size_t block_len);
 
 // Decode the literals section of a block
-static size_t decode_literals(io_streams_t *streams, frame_context_t *ctx,
-                              u8 **literals);
+static size_t decode_literals(io_streams_t *const streams,
+                              frame_context_t *const ctx, u8 **const literals);
 
 // Decode the sequences part of a block
-static size_t decode_sequences(frame_context_t *ctx, const u8 *src,
-                               size_t src_len, sequence_command_t **sequences);
+static size_t decode_sequences(frame_context_t *const ctx, const u8 *const src,
+                               const size_t src_len,
+                               sequence_command_t **const sequences);
 
 // Execute the decoded sequences on the literals block
-static size_t execute_sequences(io_streams_t *streams, frame_context_t *ctx,
-                                sequence_command_t *sequences,
-                                size_t num_sequences, const u8 *literals,
-                                size_t literals_len);
+static void execute_sequences(io_streams_t *const streams,
+                              frame_context_t *const ctx,
+                              const sequence_command_t *const sequences,
+                              const size_t num_sequences,
+                              const u8 *literals,
+                              size_t literals_len);
 
 // Parse a provided dictionary blob for use in decompression
-static void parse_dictionary(dictionary_t *dict, const u8 *src, size_t src_len);
-static void free_dictionary(dictionary_t *dict);
+static void parse_dictionary(dictionary_t *const dict, const u8 *const src,
+                             const size_t src_len);
+static void free_dictionary(dictionary_t *const dict);
 /******* END ZSTD HELPER STRUCTS AND PROTOTYPES *******************************/
 
-size_t ZSTD_decompress(void *dst, size_t dst_len, const void *src,
-                       size_t src_len) {
+size_t ZSTD_decompress(void *const dst, const size_t dst_len,
+                       const void *const src, const size_t src_len) {
     return ZSTD_decompress_with_dict(dst, dst_len, src, src_len, NULL, 0);
 }
 
-size_t ZSTD_decompress_usingDict(void *_ctx, void *dst, size_t dst_len,
-                                 const void *src, size_t src_len,
-                                 const void *dict, size_t dict_len) {
-    // _ctx needed to match ZSTD lib signature
-    return ZSTD_decompress_with_dict(dst, dst_len, src, src_len, dict,
-                                     dict_len);
-}
-
-size_t ZSTD_decompress_with_dict(void *dst, size_t dst_len, const void *src,
-                                 size_t src_len, const void *dict,
-                                 size_t dict_len) {
+size_t ZSTD_decompress_with_dict(void *const dst, const size_t dst_len,
+                                 const void *const src, const size_t src_len,
+                                 const void *const dict,
+                                 const size_t dict_len) {
     dictionary_t parsed_dict;
     memset(&parsed_dict, 0, sizeof(dictionary_t));
     // dict_len < 8 is not a valid dictionary
@@ -351,21 +360,26 @@ size_t ZSTD_decompress_with_dict(void *dst, size_t dst_len, const void *src,
 
 /******* FRAME DECODING ******************************************************/
 
-static void decode_data_frame(io_streams_t *streams, dictionary_t *dict);
-static void init_frame_context(io_streams_t *streams, frame_context_t *context,
-                               dictionary_t *dict);
-static void free_frame_context(frame_context_t *context);
-static void parse_frame_header(frame_header_t *header, const u8 *src,
-                               size_t src_len);
-static void frame_context_apply_dict(frame_context_t *ctx, dictionary_t *dict);
+static void decode_data_frame(io_streams_t *const streams,
+                              const dictionary_t *const dict);
+static void init_frame_context(io_streams_t *const streams,
+                               frame_context_t *const context,
+                               const dictionary_t *const dict);
+static void free_frame_context(frame_context_t *const context);
+static void parse_frame_header(frame_header_t *const header,
+                               const u8 *const src, const size_t src_len);
+static void frame_context_apply_dict(frame_context_t *const ctx,
+                                     const dictionary_t *const dict);
 
-static void decompress_data(io_streams_t *streams, frame_context_t *ctx);
+static void decompress_data(io_streams_t *const streams,
+                            frame_context_t *const ctx);
 
-static void decode_frame(io_streams_t *streams, dictionary_t *dict) {
+static void decode_frame(io_streams_t *const streams,
+                         const dictionary_t *const dict) {
     if (streams->src_len < 4) {
         INP_SIZE();
     }
-    u32 magic_number = read_bits_LE(streams->src, 32, 0);
+    const u32 magic_number = read_bits_LE(streams->src, 32, 0);
 
     streams->src += 4;
     streams->src_len -= 4;
@@ -374,7 +388,7 @@ static void decode_frame(io_streams_t *streams, dictionary_t *dict) {
         if (streams->src_len < 4) {
             INP_SIZE();
         }
-        size_t frame_size = read_bits_LE(streams->src, 32, 32);
+        const size_t frame_size = read_bits_LE(streams->src, 32, 32);
 
         if (streams->src_len < 4 + frame_size) {
             INP_SIZE();
@@ -396,7 +410,8 @@ static void decode_frame(io_streams_t *streams, dictionary_t *dict) {
 /// are skippable frames.
 /// See
 /// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#general-structure-of-zstandard-frame-format
-static void decode_data_frame(io_streams_t *streams, dictionary_t *dict) {
+static void decode_data_frame(io_streams_t *const streams,
+                              const dictionary_t *const dict) {
     frame_context_t ctx;
 
     // Initialize the context that needs to be carried from block to block
@@ -414,8 +429,10 @@ static void decode_data_frame(io_streams_t *streams, dictionary_t *dict) {
 
 /// Takes the information provided in the header and dictionary, and initializes
 /// the context for this frame
-static void init_frame_context(io_streams_t *streams, frame_context_t *context,
-                               dictionary_t *dict) {
+static void init_frame_context(io_streams_t *const streams,
+                               frame_context_t *const context,
+                               const dictionary_t *const dict) {
+    // Most fields in context are correct when initialized to 0
     memset(context, 0x00, sizeof(frame_context_t));
 
     // Parse data from the frame header
@@ -432,7 +449,7 @@ static void init_frame_context(io_streams_t *streams, frame_context_t *context,
     frame_context_apply_dict(context, dict);
 }
 
-static void free_frame_context(frame_context_t *context) {
+static void free_frame_context(frame_context_t *const context) {
     HUF_free_dtable(&context->literals_dtable);
 
     FSE_free_dtable(&context->ll_dtable);
@@ -442,20 +459,20 @@ static void free_frame_context(frame_context_t *context) {
     memset(context, 0, sizeof(frame_context_t));
 }
 
-static void parse_frame_header(frame_header_t *header, const u8 *src,
-                               size_t src_len) {
+static void parse_frame_header(frame_header_t *const header,
+                               const u8 *const src, const size_t src_len) {
     if (src_len < 1) {
         INP_SIZE();
     }
 
-    u8 descriptor = read_bits_LE(src, 8, 0);
+    const u8 descriptor = read_bits_LE(src, 8, 0);
 
     // decode frame header descriptor into flags
-    u8 frame_content_size_flag = descriptor >> 6;
-    u8 single_segment_flag = (descriptor >> 5) & 1;
-    u8 reserved_bit = (descriptor >> 3) & 1;
-    u8 content_checksum_flag = (descriptor >> 2) & 1;
-    u8 dictionary_id_flag = descriptor & 3;
+    const u8 frame_content_size_flag = descriptor >> 6;
+    const u8 single_segment_flag = (descriptor >> 5) & 1;
+    const u8 reserved_bit = (descriptor >> 3) & 1;
+    const u8 content_checksum_flag = (descriptor >> 2) & 1;
+    const u8 dictionary_id_flag = descriptor & 3;
 
     if (reserved_bit != 0) {
         CORRUPTION();
@@ -536,7 +553,8 @@ static void parse_frame_header(frame_header_t *header, const u8 *src,
 /// A dictionary acts as initializing values for the frame context before
 /// decompression, so we implement it by applying it's predetermined
 /// tables and content to the context before beginning decompression
-static void frame_context_apply_dict(frame_context_t *ctx, dictionary_t *dict) {
+static void frame_context_apply_dict(frame_context_t *const ctx,
+                                     const dictionary_t *const dict) {
     // If the content pointer is NULL then it must be an empty dict
     if (!dict || !dict->content)
         return;
@@ -574,8 +592,8 @@ static void frame_context_apply_dict(frame_context_t *ctx, dictionary_t *dict) {
 }
 
 /// Decompress the data from a frame block by block
-static void decompress_data(io_streams_t *streams, frame_context_t *ctx) {
-
+static void decompress_data(io_streams_t *const streams,
+                            frame_context_t *const ctx) {
     int last_block = 0;
     do {
         if (streams->src_len < 3) {
@@ -583,8 +601,8 @@ static void decompress_data(io_streams_t *streams, frame_context_t *ctx) {
         }
         // Parse the block header
         last_block = streams->src[0] & 1;
-        int block_type = (streams->src[0] >> 1) & 3;
-        size_t block_len = read_bits_LE(streams->src, 21, 3);
+        const int block_type = (streams->src[0] >> 1) & 3;
+        const size_t block_len = read_bits_LE(streams->src, 21, 3);
 
         streams->src += 3;
         streams->src_len -= 3;
@@ -656,8 +674,8 @@ static void decompress_data(io_streams_t *streams, frame_context_t *ctx) {
 /******* END FRAME DECODING ***************************************************/
 
 /******* BLOCK DECOMPRESSION **************************************************/
-static void decompress_block(io_streams_t *streams, frame_context_t *ctx,
-                             size_t block_len) {
+static void decompress_block(io_streams_t *const streams, frame_context_t *const ctx,
+                             const size_t block_len) {
     if (streams->src_len < block_len) {
         INP_SIZE();
     }
@@ -666,15 +684,15 @@ static void decompress_block(io_streams_t *streams, frame_context_t *ctx,
 
     // Part 1: decode the literals block
     u8 *literals = NULL;
-    size_t literals_size = decode_literals(streams, ctx, &literals);
+    const size_t literals_size = decode_literals(streams, ctx, &literals);
 
     // Part 2: decode the sequences block
     if (streams->src > end_of_block) {
         INP_SIZE();
     }
-    size_t sequences_size = end_of_block - streams->src;
+    const size_t sequences_size = end_of_block - streams->src;
     sequence_command_t *sequences = NULL;
-    size_t num_sequences =
+    const size_t num_sequences =
         decode_sequences(ctx, streams->src, sequences_size, &sequences);
 
     streams->src += sequences_size;
@@ -689,18 +707,22 @@ static void decompress_block(io_streams_t *streams, frame_context_t *ctx,
 /******* END BLOCK DECOMPRESSION **********************************************/
 
 /******* LITERALS DECODING ****************************************************/
-static size_t decode_literals_simple(io_streams_t *streams, u8 **literals,
-                                     int block_type, int size_format);
-static size_t decode_literals_compressed(io_streams_t *streams,
-                                         frame_context_t *ctx, u8 **literals,
-                                         int block_type, int size_format);
+static size_t decode_literals_simple(io_streams_t *const streams,
+                                     u8 **const literals, const int block_type,
+                                     const int size_format);
+static size_t decode_literals_compressed(io_streams_t *const streams,
+                                         frame_context_t *const ctx,
+                                         u8 **const literals,
+                                         const int block_type,
+                                         const int size_format);
 static size_t decode_huf_table(const u8 *src, size_t src_len,
-                               HUF_dtable *dtable);
-static size_t fse_decode_hufweights(const u8 *src, size_t src_len, u8 *weights,
-                                    int *num_symbs, size_t compressed_size);
+                               HUF_dtable *const dtable);
+static size_t fse_decode_hufweights(const u8 *const src, const size_t src_len,
+                                    u8 *const weights, int *const num_symbs,
+                                    const size_t compressed_size);
 
-static size_t decode_literals(io_streams_t *streams, frame_context_t *ctx,
-                              u8 **literals) {
+static size_t decode_literals(io_streams_t *const streams,
+                              frame_context_t *const ctx, u8 **const literals) {
     if (streams->src_len < 1) {
         INP_SIZE();
     }
@@ -720,8 +742,9 @@ static size_t decode_literals(io_streams_t *streams, frame_context_t *ctx,
 }
 
 /// Decodes literals blocks in raw or RLE form
-static size_t decode_literals_simple(io_streams_t *streams, u8 **literals,
-                                     int block_type, int size_format) {
+static size_t decode_literals_simple(io_streams_t *const streams,
+                                     u8 **const literals, const int block_type,
+                                     const int size_format) {
     size_t size;
     switch (size_format) {
     // These cases are in the form X0
@@ -787,9 +810,11 @@ static size_t decode_literals_simple(io_streams_t *streams, u8 **literals,
 }
 
 /// Decodes Huffman compressed literals
-static size_t decode_literals_compressed(io_streams_t *streams,
-                                         frame_context_t *ctx, u8 **literals,
-                                         int block_type, int size_format) {
+static size_t decode_literals_compressed(io_streams_t *const streams,
+                                         frame_context_t *const ctx,
+                                         u8 **const literals,
+                                         const int block_type,
+                                         const int size_format) {
     size_t regenerated_size, compressed_size;
     // Only size_format=0 has 1 stream, so default to 4
     int num_streams = 4;
@@ -846,8 +871,8 @@ static size_t decode_literals_compressed(io_streams_t *streams,
         // Decode provided Huffman table
 
         HUF_free_dtable(&ctx->literals_dtable);
-        size_t size = decode_huf_table(streams->src, compressed_size,
-                                       &ctx->literals_dtable);
+        const size_t size = decode_huf_table(streams->src, compressed_size,
+                                             &ctx->literals_dtable);
         streams->src += size;
         streams->src_len -= size;
         compressed_size -= size;
@@ -873,14 +898,14 @@ static size_t decode_literals_compressed(io_streams_t *streams,
 
 // Decode the Huffman table description
 static size_t decode_huf_table(const u8 *src, size_t src_len,
-                               HUF_dtable *dtable) {
+                               HUF_dtable *const dtable) {
     if (src_len < 1) {
         INP_SIZE();
     }
 
     const u8 *const osrc = src;
 
-    u8 header = src[0];
+    const u8 header = src[0];
     u8 weights[HUF_MAX_SYMBS];
     memset(weights, 0, sizeof(weights));
 
@@ -892,13 +917,16 @@ static size_t decode_huf_table(const u8 *src, size_t src_len,
     if (header >= 128) {
         // Direct representation, read the weights out
         num_symbs = header - 127;
-        size_t bytes = (num_symbs + 1) / 2;
+        const size_t bytes = (num_symbs + 1) / 2;
 
         if (bytes > src_len) {
             INP_SIZE();
         }
 
         for (int i = 0; i < num_symbs; i++) {
+            // read_bits_LE isn't applicable here because the weights are order
+            // reversed within each byte
+            // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#huffman-tree-header
             if (i % 2 == 0) {
                 weights[i] = src[i / 2] >> 4;
             } else {
@@ -911,7 +939,7 @@ static size_t decode_huf_table(const u8 *src, size_t src_len,
     } else {
         // The weights are FSE encoded, decode them before we can construct the
         // table
-        size_t size =
+        const size_t size =
             fse_decode_hufweights(src, src_len, weights, &num_symbs, header);
         src += size;
         src_len -= size;
@@ -922,14 +950,16 @@ static size_t decode_huf_table(const u8 *src, size_t src_len,
     return src - osrc;
 }
 
-static size_t fse_decode_hufweights(const u8 *src, size_t src_len, u8 *weights,
-                                    int *num_symbs, size_t compressed_size) {
+static size_t fse_decode_hufweights(const u8 *const src, const size_t src_len,
+                                    u8 *const weights, int *const num_symbs,
+                                    const size_t compressed_size) {
     const int MAX_ACCURACY_LOG = 7;
 
     FSE_dtable dtable;
 
     // Construct the FSE table
-    size_t read = FSE_decode_header(&dtable, src, src_len, MAX_ACCURACY_LOG);
+    const size_t read =
+            FSE_decode_header(&dtable, src, src_len, MAX_ACCURACY_LOG);
 
     if (src_len < compressed_size) {
         INP_SIZE();
@@ -1001,16 +1031,20 @@ static const u8 SEQ_MATCH_LENGTH_EXTRA_BITS[53] = {
 /// Offset decoding is simpler so we just need a maximum code value
 static const u8 SEQ_MAX_CODES[3] = {35, -1, 52};
 
-static void decompress_sequences(frame_context_t *ctx, const u8 *src,
-                                 size_t src_len, sequence_command_t *sequences,
-                                 size_t num_sequences);
-static sequence_command_t decode_sequence(sequence_state_t *state,
-                                          const u8 *src, i64 *offset);
-static size_t decode_seq_table(const u8 *src, size_t src_len, FSE_dtable *table,
-                               seq_part_t type, seq_mode_t mode);
+static void decompress_sequences(frame_context_t *const ctx, const u8 *src,
+                                 size_t src_len,
+                                 sequence_command_t *const sequences,
+                                 const size_t num_sequences);
+static sequence_command_t decode_sequence(sequence_state_t *const state,
+                                          const u8 *const src,
+                                          i64 *const offset);
+static size_t decode_seq_table(const u8 *src, size_t src_len,
+                               FSE_dtable *const table, const seq_part_t type,
+                               const seq_mode_t mode);
 
-static size_t decode_sequences(frame_context_t *ctx, const u8 *src,
-                               size_t src_len, sequence_command_t **sequences) {
+static size_t decode_sequences(frame_context_t *const ctx, const u8 *src,
+                               size_t src_len,
+                               sequence_command_t **const sequences) {
     size_t num_sequences;
 
     // Decode the sequence header and allocate space for the output
@@ -1050,9 +1084,10 @@ static size_t decode_sequences(frame_context_t *ctx, const u8 *src,
 }
 
 /// Decompress the FSE encoded sequence commands
-static void decompress_sequences(frame_context_t *ctx, const u8 *src,
-                                 size_t src_len, sequence_command_t *sequences,
-                                 size_t num_sequences) {
+static void decompress_sequences(frame_context_t *const ctx, const u8 *src,
+                                 size_t src_len,
+                                 sequence_command_t *const sequences,
+                                 const size_t num_sequences) {
     if (src_len < 1) {
         INP_SIZE();
     }
@@ -1064,21 +1099,31 @@ static void decompress_sequences(frame_context_t *ctx, const u8 *src,
         CORRUPTION();
     }
 
-    sequence_state_t state;
-    size_t read;
-    // Update the tables we have stored in the context
-    read = decode_seq_table(src, src_len, &ctx->ll_dtable, seq_literal_length,
-                            (compression_modes >> 6) & 3);
-    src += read;
-    src_len -= read;
-    read = decode_seq_table(src, src_len, &ctx->of_dtable, seq_offset,
-                            (compression_modes >> 4) & 3);
-    src += read;
-    src_len -= read;
-    read = decode_seq_table(src, src_len, &ctx->ml_dtable, seq_match_length,
-                            (compression_modes >> 2) & 3);
-    src += read;
-    src_len -= read;
+    {
+        size_t read;
+        // Update the tables we have stored in the context
+        read = decode_seq_table(src, src_len, &ctx->ll_dtable,
+                                seq_literal_length,
+                                (compression_modes >> 6) & 3);
+        src += read;
+        src_len -= read;
+    }
+
+    {
+        const size_t read =
+                decode_seq_table(src, src_len, &ctx->of_dtable, seq_offset,
+                                 (compression_modes >> 4) & 3);
+        src += read;
+        src_len -= read;
+    }
+
+    {
+        const size_t read = decode_seq_table(src, src_len, &ctx->ml_dtable,
+                                             seq_match_length,
+                                             (compression_modes >> 2) & 3);
+        src += read;
+        src_len -= read;
+    }
 
     // Check to make sure none of the tables are uninitialized
     if (!ctx->ll_dtable.symbols || !ctx->of_dtable.symbols ||
@@ -1086,12 +1131,13 @@ static void decompress_sequences(frame_context_t *ctx, const u8 *src,
         CORRUPTION();
     }
 
-    // Now use the context's tables
+    sequence_state_t state;
+    // Copy the context's tables into the local state
     memcpy(&state.ll_table, &ctx->ll_dtable, sizeof(FSE_dtable));
     memcpy(&state.of_table, &ctx->of_dtable, sizeof(FSE_dtable));
     memcpy(&state.ml_table, &ctx->ml_dtable, sizeof(FSE_dtable));
 
-    int padding = 8 - log2inf(src[src_len - 1]);
+    const int padding = 8 - log2inf(src[src_len - 1]);
     i64 offset = src_len * 8 - padding;
 
     FSE_init_state(&state.ll_table, &state.ll_state, src, &offset);
@@ -1111,12 +1157,13 @@ static void decompress_sequences(frame_context_t *ctx, const u8 *src,
 }
 
 // Decode a single sequence and update the state
-static sequence_command_t decode_sequence(sequence_state_t *state,
-                                          const u8 *src, i64 *offset) {
+static sequence_command_t decode_sequence(sequence_state_t *const state,
+                                          const u8 *const src,
+                                          i64 *const offset) {
     // Decode symbols, but don't update states
-    u8 of_code = FSE_peek_symbol(&state->of_table, state->of_state);
-    u8 ll_code = FSE_peek_symbol(&state->ll_table, state->ll_state);
-    u8 ml_code = FSE_peek_symbol(&state->ml_table, state->ml_state);
+    const u8 of_code = FSE_peek_symbol(&state->of_table, state->of_state);
+    const u8 ll_code = FSE_peek_symbol(&state->ll_table, state->ll_state);
+    const u8 ml_code = FSE_peek_symbol(&state->ml_table, state->ml_state);
 
     // Offset doesn't need a max value as it's not decoded using a table
     if (ll_code > SEQ_MAX_CODES[seq_literal_length] ||
@@ -1147,9 +1194,9 @@ static sequence_command_t decode_sequence(sequence_state_t *state,
 }
 
 /// Given a sequence part and table mode, decode the FSE distribution
-static size_t decode_seq_table(const u8 *src, size_t src_len, FSE_dtable *table,
-                               seq_part_t type, seq_mode_t mode) {
-
+static size_t decode_seq_table(const u8 *src, size_t src_len,
+                               FSE_dtable *const table, const seq_part_t type,
+                               const seq_mode_t mode) {
     // Constant arrays indexed by seq_part_t
     const i16 *const default_distributions[] = {SEQ_LITERAL_LENGTH_DEFAULT_DIST,
                                                 SEQ_OFFSET_DEFAULT_DIST,
@@ -1178,7 +1225,7 @@ static size_t decode_seq_table(const u8 *src, size_t src_len, FSE_dtable *table,
         if (src_len < 1) {
             INP_SIZE();
         }
-        u8 symb = src[0];
+        const u8 symb = src[0];
         src++;
         src_len--;
         FSE_init_dtable_rle(table, symb);
@@ -1204,15 +1251,17 @@ static size_t decode_seq_table(const u8 *src, size_t src_len, FSE_dtable *table,
 /******* END SEQUENCE DECODING ************************************************/
 
 /******* SEQUENCE EXECUTION ***************************************************/
-static size_t execute_sequences(io_streams_t *streams, frame_context_t *ctx,
-                                sequence_command_t *sequences,
-                                size_t num_sequences, const u8 *literals,
-                                size_t literals_len) {
-    u64 *offset_hist = ctx->previous_offsets;
+static void execute_sequences(io_streams_t *const streams,
+                              frame_context_t *const ctx,
+                              const sequence_command_t *const sequences,
+                              const size_t num_sequences,
+                              const u8 *literals,
+                              size_t literals_len) {
+    u64 *const offset_hist = ctx->previous_offsets;
     size_t total_output = ctx->current_total_output;
 
     for (size_t i = 0; i < num_sequences; i++) {
-        sequence_command_t seq = sequences[i];
+        const sequence_command_t seq = sequences[i];
 
         if (seq.literal_length > literals_len) {
             CORRUPTION();
@@ -1312,46 +1361,48 @@ static size_t execute_sequences(io_streams_t *streams, frame_context_t *ctx,
 /******* END SEQUENCE EXECUTION ***********************************************/
 
 /******* OUTPUT SIZE COUNTING *************************************************/
-size_t traverse_frame(frame_header_t *header, const u8 *src, size_t src_len);
+size_t traverse_frame(const frame_header_t *const header, const u8 *src,
+                      size_t src_len);
 
 /// Get the decompressed size of an input stream so memory can be allocated in
 /// advance.
 /// This is more complex than the implementation in the reference
 /// implementation, as this API allows for the decompression of multiple
 /// concatenated frames.
-size_t ZSTD_get_decompressed_size(const void *src, size_t src_len) {
+size_t ZSTD_get_decompressed_size(const void *src, const size_t src_len) {
   const u8 *ip = (const u8 *) src;
+  size_t ip_len = src_len;
   size_t dst_size = 0;
 
   // Each frame header only gives us the size of its frame, so iterate over all
   // frames
-  while (src_len > 0) {
-    if (src_len < 4) {
+  while (ip_len > 0) {
+    if (ip_len < 4) {
       INP_SIZE();
     }
 
-    u32 magic_number = read_bits_LE(ip, 32, 0);
+    const u32 magic_number = read_bits_LE(ip, 32, 0);
 
     ip += 4;
-    src_len -= 4;
+    ip_len -= 4;
     if (magic_number >= 0x184D2A50U && magic_number <= 0x184D2A5F) {
         // skippable frame, this has no impact on output size
-        if (src_len < 4) {
+        if (ip_len < 4) {
             INP_SIZE();
         }
-        size_t frame_size = read_bits_LE(ip, 32, 32);
+        const size_t frame_size = read_bits_LE(ip, 32, 32);
 
-        if (src_len < 4 + frame_size) {
+        if (ip_len < 4 + frame_size) {
             INP_SIZE();
         }
 
         // skip over frame
         ip += 4 + frame_size;
-        src_len -= 4 + frame_size;
+        ip_len -= 4 + frame_size;
     } else if (magic_number == 0xFD2FB528U) {
         // ZSTD frame
         frame_header_t header;
-        parse_frame_header(&header, ip, src_len);
+        parse_frame_header(&header, ip, ip_len);
 
         if (header.frame_content_size == 0 && !header.single_segment_flag) {
             // Content size not provided, we can't tell
@@ -1361,9 +1412,9 @@ size_t ZSTD_get_decompressed_size(const void *src, size_t src_len) {
         dst_size += header.frame_content_size;
 
         // we need to traverse the frame to find when the next one starts
-        size_t traversed = traverse_frame(&header, ip, src_len);
+        const size_t traversed = traverse_frame(&header, ip, ip_len);
         ip += traversed;
-        src_len -= traversed;
+        ip_len -= traversed;
     } else {
         // not a real frame
         ERROR("Invalid magic number");
@@ -1375,7 +1426,8 @@ size_t ZSTD_get_decompressed_size(const void *src, size_t src_len) {
 
 /// Iterate over each block in a frame to find the end of it, to get to the
 /// start of the next frame
-size_t traverse_frame(frame_header_t *header, const u8 *src, size_t src_len) {
+size_t traverse_frame(const frame_header_t *const header, const u8 *src,
+                      size_t src_len) {
     const u8 *const src_beg = src;
     const u8 *const src_end = src + src_len;
     src += header->header_size;
@@ -1389,8 +1441,8 @@ size_t traverse_frame(frame_header_t *header, const u8 *src, size_t src_len) {
         }
         // Parse the block header
         last_block = src[0] & 1;
-        int block_type = (src[0] >> 1) & 3;
-        size_t block_len = read_bits_LE(src, 21, 3);
+        const int block_type = (src[0] >> 1) & 3;
+        const size_t block_len = read_bits_LE(src, 21, 3);
 
         src += 3;
         switch (block_type) {
@@ -1432,16 +1484,16 @@ size_t traverse_frame(frame_header_t *header, const u8 *src, size_t src_len) {
 /******* END OUTPUT SIZE COUNTING *********************************************/
 
 /******* DICTIONARY PARSING ***************************************************/
-static void init_raw_content_dict(dictionary_t *dict, const u8 *src,
-                                  size_t src_len);
+static void init_raw_content_dict(dictionary_t *const dict, const u8 *const src,
+                                  const size_t src_len);
 
-static void parse_dictionary(dictionary_t *dict, const u8 *src,
+static void parse_dictionary(dictionary_t *const dict, const u8 *src,
                              size_t src_len) {
     memset(dict, 0, sizeof(dictionary_t));
     if (src_len < 8) {
         INP_SIZE();
     }
-    u32 magic_number = read_bits_LE(src, 32, 0);
+    const u32 magic_number = read_bits_LE(src, 32, 0);
     if (magic_number != 0xEC30A437) {
         // raw content dict
         init_raw_content_dict(dict, src, src_len);
@@ -1454,25 +1506,26 @@ static void parse_dictionary(dictionary_t *dict, const u8 *src,
 
     // Parse the provided entropy tables in order
     {
-        size_t read = decode_huf_table(src, src_len, &dict->literals_dtable);
+        const size_t read =
+                decode_huf_table(src, src_len, &dict->literals_dtable);
         src += read;
         src_len -= read;
     }
     {
-        size_t read = decode_seq_table(src, src_len, &dict->of_dtable,
-                                       seq_offset, seq_fse);
+        const size_t read = decode_seq_table(src, src_len, &dict->of_dtable,
+                                             seq_offset, seq_fse);
         src += read;
         src_len -= read;
     }
     {
-        size_t read = decode_seq_table(src, src_len, &dict->ml_dtable,
-                                       seq_match_length, seq_fse);
+        const size_t read = decode_seq_table(src, src_len, &dict->ml_dtable,
+                                             seq_match_length, seq_fse);
         src += read;
         src_len -= read;
     }
     {
-        size_t read = decode_seq_table(src, src_len, &dict->ll_dtable,
-                                       seq_literal_length, seq_fse);
+        const size_t read = decode_seq_table(src, src_len, &dict->ll_dtable,
+                                             seq_literal_length, seq_fse);
         src += read;
         src_len -= read;
     }
@@ -1505,8 +1558,8 @@ static void parse_dictionary(dictionary_t *dict, const u8 *src,
 }
 
 /// If parse_dictionary is given a raw content dictionary, it delegates here
-static void init_raw_content_dict(dictionary_t *dict, const u8 *src,
-                                  size_t src_len) {
+static void init_raw_content_dict(dictionary_t *const dict, const u8 *const src,
+                                  const size_t src_len) {
     dict->dictionary_id = 0;
     // Copy in the content
     dict->content = malloc(src_len);
@@ -1519,7 +1572,7 @@ static void init_raw_content_dict(dictionary_t *dict, const u8 *src,
 }
 
 /// Free an allocated dictionary
-static void free_dictionary(dictionary_t *dict) {
+static void free_dictionary(dictionary_t *const dict) {
     HUF_free_dtable(&dict->literals_dtable);
     FSE_free_dtable(&dict->ll_dtable);
     FSE_free_dtable(&dict->of_dtable);
@@ -1531,179 +1584,50 @@ static void free_dictionary(dictionary_t *dict) {
 }
 /******* END DICTIONARY PARSING ***********************************************/
 
-/******* CIRCULAR BUFFER ******************************************************/
-static void cbuf_init(cbuf_t *buf, size_t size) {
-    buf->ptr = malloc(size);
-
-    if (!buf->ptr) {
-        BAD_ALLOC();
-    }
-
-    memset(buf->ptr, 0x3f, size);
-
-    buf->size = size;
-    buf->idx = 0;
-    buf->last_flush = 0;
-}
-
-static size_t cbuf_write_data(cbuf_t *buf, const u8 *src, size_t src_len) {
-    if (buf->size == 0 && src_len > 0) {
-        CORRUPTION();
-    }
-    size_t max_len = buf->size - buf->idx;
-    size_t len = MIN(src_len, max_len);
-
-    memcpy(buf->ptr + buf->idx, src, len);
-
-    buf->idx += len;
-
-    return len;
-}
-
-static size_t cbuf_write_data_full(cbuf_t *buf, const u8 *src, size_t src_len,
-                                   u8 *out, size_t out_len) {
-    size_t written = 0;
-    size_t flushed = 0;
-    while (1) {
-        written += cbuf_write_data(buf, src + written, src_len - written);
-        if (written == src_len) {
-            break;
-        } else {
-            flushed += cbuf_flush(buf, out + flushed, out_len - flushed);
-        }
-    }
-
-    return flushed;
-}
-
-static size_t cbuf_copy_offset(cbuf_t *buf, size_t offset, size_t len) {
-    if (buf->size == 0 && len > 0) {
-        CORRUPTION();
-    }
-    if (offset > buf->size) {
-        CORRUPTION();
-    }
-    size_t max_len = buf->size - buf->idx;
-    len = MIN(len, max_len);
-
-    size_t read_off = (buf->idx + buf->size - offset) % buf->size;
-
-    for (size_t i = 0; i < len; i++) {
-        buf->ptr[buf->idx++] = buf->ptr[read_off++];
-        if (read_off == buf->size) {
-            read_off = 0;
-        }
-    }
-
-    return len;
-}
-
-static size_t cbuf_copy_offset_full(cbuf_t *buf, size_t offset, size_t len,
-                                    u8 *out, size_t out_len) {
-    size_t written = 0;
-    size_t flushed = 0;
-    while (1) {
-        written += cbuf_copy_offset(buf, offset, len - written);
-        if (written == len) {
-            break;
-        } else {
-            flushed += cbuf_flush(buf, out + flushed, out_len - flushed);
-        }
-    }
-
-    return flushed;
-}
-
-static size_t cbuf_repeat_byte(cbuf_t *buf, u8 byte, size_t len) {
-    if (buf->size == 0 && len > 0) {
-        CORRUPTION();
-    }
-    size_t max_len = buf->size - buf->idx;
-    len = MIN(len, max_len);
-
-    memset(buf->ptr + buf->idx, byte, len);
-
-    return len;
-}
-
-static size_t cbuf_repeat_byte_full(cbuf_t *buf, u8 byte, size_t len, u8 *out,
-                                    size_t out_len) {
-    size_t written = 0;
-    size_t flushed = 0;
-    while (1) {
-        written += cbuf_repeat_byte(buf, byte, len - written);
-        if (written == len) {
-            break;
-        } else {
-            flushed += cbuf_flush(buf, out + flushed, out_len - flushed);
-        }
-    }
-
-    return flushed;
-}
-
-static size_t cbuf_flush(cbuf_t *buf, u8 *dst, size_t dst_len) {
-    if (buf->idx < buf->last_flush) {
-        CORRUPTION();
-    }
-
-    size_t len = buf->idx - buf->last_flush;
-
-    if (dst && len > dst_len) {
-        OUT_SIZE();
-    }
-
-    // allow for NULL buffers to indicate flushing to nowhere
-    if (dst) {
-        memcpy(dst, buf->ptr + buf->last_flush, len);
-    }
-
-    // we could have a 0 size buffer
-    if (buf->size) {
-        buf->idx = buf->idx % buf->size;
-    }
-    buf->last_flush = buf->idx;
-
-    return len;
-}
-
-static void cbuf_free(cbuf_t *buf) {
-    free(buf->ptr);
-    memset(buf, 0, sizeof(cbuf_t));
-}
-/******* END CIRCULAR BUFFER **************************************************/
-
 /******* BITSTREAM OPERATIONS *************************************************/
-static inline u64 read_bits_LE(const u8 *src, int num, size_t offset) {
+/// Read `num` bits (up to 64) from `src + offset`, where `offset` is in bits
+static inline u64 read_bits_LE(const u8 *src, const int num,
+                               const size_t offset) {
     if (num > 64) {
         return -1;
     }
 
+    // Skip over bytes that aren't in range
     src += offset / 8;
-    offset %= 8;
+    size_t bit_offset = offset % 8;
     u64 res = 0;
 
     int shift = 0;
     int left = num;
     while (left > 0) {
         u64 mask = left >= 8 ? 0xff : (((u64)1 << left) - 1);
-        res += (((u64)*src++ >> offset) & mask) << shift;
-        shift += 8 - offset;
-        left -= 8 - offset;
-        offset = 0;
+        // Read the next byte, shift it to account for the offset, and then mask
+        // out the top part if we don't need all the bits
+        res += (((u64)*src++ >> bit_offset) & mask) << shift;
+        shift += 8 - bit_offset;
+        left -= 8 - bit_offset;
+        bit_offset = 0;
     }
 
     return res;
 }
 
-static inline u64 STREAM_read_bits(const u8 *src, int bits, i64 *offset) {
+/// Read bits from the end of a HUF or FSE bitstream.  `offset` is in bits, so
+/// it updates `offset` to `offset - bits`, and then reads `bits` bits from
+/// `src + offset`.  If the offset becomes negative, the extra bits at the
+/// bottom are filled in with `0` bits instead of reading from before `src`.
+static inline u64 STREAM_read_bits(const u8 *const src, const int bits,
+                                   i64 *const offset) {
     *offset = *offset - bits;
     size_t actual_off = *offset;
+    size_t actual_bits = bits;
+    // Don't actually read bits from before the start of src, so if `*offset <
+    // 0` fix actual_off and actual_bits to reflect the quantity to read
     if (*offset < 0) {
-        bits += *offset;
+        actual_bits += *offset;
         actual_off = 0;
     }
-    u64 res = read_bits_LE(src, bits, actual_off);
+    u64 res = read_bits_LE(src, actual_bits, actual_off);
 
     if (*offset < 0) {
         // Fill in the bottom "overflowed" bits with 0's
@@ -1714,16 +1638,9 @@ static inline u64 STREAM_read_bits(const u8 *src, int bits, i64 *offset) {
 /******* END BITSTREAM OPERATIONS *********************************************/
 
 /******* BIT COUNTING OPERATIONS **********************************************/
-static inline int log2sup(u64 num) {
-    for (int i = 0; i < 64; i++) {
-        if (((u64)1 << i) >= num) {
-            return i;
-        }
-    }
-    return -1;
-}
-
-static inline int log2inf(u64 num) {
+/// Returns `x`, where `2^x` is the largest power of 2 less than or equal to
+/// `num`, or `-1` if `num == 0`.
+static inline int log2inf(const u64 num) {
     for (int i = 63; i >= 0; i--) {
         if (((u64)1 << i) <= num) {
             return i;
@@ -1734,33 +1651,38 @@ static inline int log2inf(u64 num) {
 /******* END BIT COUNTING OPERATIONS ******************************************/
 
 /******* HUFFMAN PRIMITIVES ***************************************************/
-static inline u8 HUF_decode_symbol(HUF_dtable *dtable, u16 *state,
-                                   const u8 *src, i64 *offset) {
+static inline u8 HUF_decode_symbol(const HUF_dtable *const dtable,
+                                   u16 *const state, const u8 *const src,
+                                   i64 *const offset) {
     // Look up the symbol and number of bits to read
     const u8 symb = dtable->symbols[*state];
     const u8 bits = dtable->num_bits[*state];
     const u16 rest = STREAM_read_bits(src, bits, offset);
+    // Shift `bits` bits out of the state, keeping the low order bits that
+    // weren't necessary to determine this symbol.  Then add in the new bits
+    // read from the stream.
     *state = ((*state << bits) + rest) & (((u16)1 << dtable->max_bits) - 1);
 
     return symb;
 }
 
-static inline void HUF_init_state(HUF_dtable *dtable, u16 *state, const u8 *src,
-                                  i64 *offset) {
-    // Read in a full dtable->max_bits to initialize the state
+static inline void HUF_init_state(const HUF_dtable *const dtable,
+                                  u16 *const state, const u8 *const src,
+                                  i64 *const offset) {
+    // Read in a full `dtable->max_bits` bits to initialize the state
     const u8 bits = dtable->max_bits;
     *state = STREAM_read_bits(src, bits, offset);
 }
 
-static size_t HUF_decompress_1stream(HUF_dtable *dtable, u8 *dst,
-                                     size_t dst_len, const u8 *src,
+static size_t HUF_decompress_1stream(const HUF_dtable *const dtable, u8 *dst,
+                                     const size_t dst_len, const u8 *src,
                                      size_t src_len) {
-    u8 *const dst_max = dst + dst_len;
-    u8 *const odst = dst;
+    const u8 *const dst_max = dst + dst_len;
+    const u8 *const odst = dst;
 
     // To maintain similarity with FSE, start from the end
     // Find the last 1 bit
-    int padding = 8 - log2inf(src[src_len - 1]);
+    const int padding = 8 - log2inf(src[src_len - 1]);
 
     i64 offset = src_len * 8 - padding;
     u16 state;
@@ -1768,6 +1690,7 @@ static size_t HUF_decompress_1stream(HUF_dtable *dtable, u8 *dst,
     HUF_init_state(dtable, &state, src, &offset);
 
     while (dst < dst_max && offset > -dtable->max_bits) {
+        // Iterate over the stream, decoding one symbol at a time
         *dst++ = HUF_decode_symbol(dtable, &state, src, &offset);
     }
     // If we stopped before consuming all the input, we didn't have enough space
@@ -1775,8 +1698,11 @@ static size_t HUF_decompress_1stream(HUF_dtable *dtable, u8 *dst,
         OUT_SIZE();
     }
 
-    // The current state should be the `max_bits` preceding the start as
-    // everything from `src` onward should be consumed
+    // When all symbols have been decoded, the final state value shouldn't have
+    // any data from the stream, so it should have "read" dtable->max_bits bits
+    // from before the start of `src`.
+    // Therefore `offset`, the edge to start reading new bits at, should be
+    // dtable->max_bits bits before the start of the stream.
     if (offset != -dtable->max_bits) {
         CORRUPTION();
     }
@@ -1784,28 +1710,18 @@ static size_t HUF_decompress_1stream(HUF_dtable *dtable, u8 *dst,
     return dst - odst;
 }
 
-static size_t HUF_decompress_4stream(HUF_dtable *dtable, u8 *dst,
-                                     size_t dst_len, const u8 *src,
-                                     size_t src_len) {
-    // Decode each stream independently for simplicity
-    // If we wanted to we could decode all 4 at the same time for speed,
-    // utilizing
-    // more execution units
-
-    const u8 *src1, *src2, *src3, *src4, *src_end;
-    u8 *dst1, *dst2, *dst3, *dst4, *dst_end;
-
-    size_t total_out = 0;
-
+static size_t HUF_decompress_4stream(const HUF_dtable *const dtable, u8 *dst,
+                                     const size_t dst_len, const u8 *const src,
+                                     const size_t src_len) {
     if (src_len < 6) {
         INP_SIZE();
     }
 
-    src1 = src + 6;
-    src2 = src1 + read_bits_LE(src, 16, 0);
-    src3 = src2 + read_bits_LE(src, 16, 16);
-    src4 = src3 + read_bits_LE(src, 16, 32);
-    src_end = src + src_len;
+    const u8 *const src1 = src + 6;
+    const u8 *const src2 = src1 + read_bits_LE(src, 16, 0);
+    const u8 *const src3 = src2 + read_bits_LE(src, 16, 16);
+    const u8 *const src4 = src3 + read_bits_LE(src, 16, 32);
+    const u8 *const src_end = src + src_len;
 
     // We can't test with all 4 sizes because the 4th size is a function of the
     // other 3 and the provided length
@@ -1813,26 +1729,32 @@ static size_t HUF_decompress_4stream(HUF_dtable *dtable, u8 *dst,
         INP_SIZE();
     }
 
-    size_t segment_size = (dst_len + 3) / 4;
-    dst1 = dst;
-    dst2 = dst1 + segment_size;
-    dst3 = dst2 + segment_size;
-    dst4 = dst3 + segment_size;
-    dst_end = dst + dst_len;
+    const size_t segment_size = (dst_len + 3) / 4;
+    u8 *const dst1 = dst;
+    u8 *const dst2 = dst1 + segment_size;
+    u8 *const dst3 = dst2 + segment_size;
+    u8 *const dst4 = dst3 + segment_size;
+    u8 *const dst_end = dst + dst_len;
 
-    total_out +=
-        HUF_decompress_1stream(dtable, dst1, segment_size, src1, src2 - src1);
-    total_out +=
-        HUF_decompress_1stream(dtable, dst2, segment_size, src2, src3 - src2);
-    total_out +=
-        HUF_decompress_1stream(dtable, dst3, segment_size, src3, src4 - src3);
+    size_t total_out = 0;
+
+    // Decode each stream independently for simplicity
+    // If we wanted to we could decode all 4 at the same time for speed,
+    // utilizing more execution units
+    total_out += HUF_decompress_1stream(dtable, dst1, segment_size, src1,
+                                        src2 - src1);
+    total_out += HUF_decompress_1stream(dtable, dst2, segment_size, src2,
+                                        src3 - src2);
+    total_out += HUF_decompress_1stream(dtable, dst3, segment_size, src3,
+                                        src4 - src3);
     total_out += HUF_decompress_1stream(dtable, dst4, dst_end - dst4, src4,
                                         src_end - src4);
 
     return total_out;
 }
 
-static void HUF_init_dtable(HUF_dtable *table, u8 *bits, int num_symbs) {
+static void HUF_init_dtable(HUF_dtable *const table, const u8 *const bits,
+                            const int num_symbs) {
     memset(table, 0, sizeof(HUF_dtable));
     if (num_symbs > HUF_MAX_SYMBS) {
         ERROR("Too many symbols for Huffman");
@@ -1852,7 +1774,7 @@ static void HUF_init_dtable(HUF_dtable *table, u8 *bits, int num_symbs) {
         rank_count[bits[i]]++;
     }
 
-    size_t table_size = 1 << max_bits;
+    const size_t table_size = 1 << max_bits;
     table->max_bits = max_bits;
     table->symbols = malloc(table_size);
     table->num_bits = malloc(table_size);
@@ -1881,6 +1803,9 @@ static void HUF_init_dtable(HUF_dtable *table, u8 *bits, int num_symbs) {
         if (bits[i] != 0) {
             // Allocate a code for this symbol and set its range in the table
             const u16 code = rank_idx[bits[i]];
+            // Since the code doesn't care about the bottom `max_bits - bits[i]`
+            // bits of state, it gets a range that spans all possible values of
+            // the lower bits
             const u16 len = 1 << (max_bits - bits[i]);
             memset(&table->symbols[code], i, len);
             rank_idx[bits[i]] += len;
@@ -1888,8 +1813,9 @@ static void HUF_init_dtable(HUF_dtable *table, u8 *bits, int num_symbs) {
     }
 }
 
-static void HUF_init_dtable_usingweights(HUF_dtable *table, u8 *weights,
-                                         int num_symbs) {
+static void HUF_init_dtable_usingweights(HUF_dtable *const table,
+                                         const u8 *const weights,
+                                         const int num_symbs) {
     // +1 because the last weight is not transmitted in the header
     if (num_symbs + 1 > HUF_MAX_SYMBS) {
         ERROR("Too many symbols for Huffman");
@@ -1903,37 +1829,40 @@ static void HUF_init_dtable_usingweights(HUF_dtable *table, u8 *weights,
     }
 
     // Find the first power of 2 larger than the sum
-    int max_bits = log2inf(weight_sum) + 1;
-    u64 left_over = ((u64)1 << max_bits) - weight_sum;
+    const int max_bits = log2inf(weight_sum) + 1;
+    const u64 left_over = ((u64)1 << max_bits) - weight_sum;
     // If the left over isn't a power of 2, the weights are invalid
     if (left_over & (left_over - 1)) {
         CORRUPTION();
     }
 
-    int last_weight = log2inf(left_over) + 1;
+    // left_over is used to find the last weight, as it's not transmitted;
+    // by inverting 2^(weight - 1) we can determine the value of last_weight.
+    const int last_weight = log2inf(left_over) + 1;
 
     for (int i = 0; i < num_symbs; i++) {
         bits[i] = weights[i] > 0 ? (max_bits + 1 - weights[i]) : 0;
     }
     bits[num_symbs] =
-        max_bits + 1 - last_weight; // last weight is always non-zero
+        max_bits + 1 - last_weight; // Last weight is always non-zero
 
     HUF_init_dtable(table, bits, num_symbs + 1);
 }
 
-static void HUF_free_dtable(HUF_dtable *dtable) {
+static void HUF_free_dtable(HUF_dtable *const dtable) {
     free(dtable->symbols);
     free(dtable->num_bits);
     memset(dtable, 0, sizeof(HUF_dtable));
 }
 
-static void HUF_copy_dtable(HUF_dtable *dst, const HUF_dtable *src) {
+static void HUF_copy_dtable(HUF_dtable *const dst,
+                            const HUF_dtable *const src) {
     if (src->max_bits == 0) {
         memset(dst, 0, sizeof(HUF_dtable));
         return;
     }
 
-    size_t size = (size_t)1 << src->max_bits;
+    const size_t size = (size_t)1 << src->max_bits;
     dst->max_bits = src->max_bits;
 
     dst->symbols = malloc(size);
@@ -1948,46 +1877,56 @@ static void HUF_copy_dtable(HUF_dtable *dst, const HUF_dtable *src) {
 /******* END HUFFMAN PRIMITIVES ***********************************************/
 
 /******* FSE PRIMITIVES *******************************************************/
-static inline u8 FSE_peek_symbol(FSE_dtable *dtable, u16 state) {
+/// Allow a symbol to be decoded without updating state
+static inline u8 FSE_peek_symbol(const FSE_dtable *const dtable,
+                                 const u16 state) {
     return dtable->symbols[state];
 }
 
-static inline void FSE_update_state(FSE_dtable *dtable, u16 *state,
-                                    const u8 *src, i64 *offset) {
+/// Consumes bits from the input and uses the current state to determine the
+/// next state
+static inline void FSE_update_state(const FSE_dtable *const dtable,
+                                    u16 *const state, const u8 *const src,
+                                    i64 *const offset) {
     const u8 bits = dtable->num_bits[*state];
     const u16 rest = STREAM_read_bits(src, bits, offset);
     *state = dtable->new_state_base[*state] + rest;
 }
 
-// Decodes a single FSE symbol and updates the offset
-static inline u8 FSE_decode_symbol(FSE_dtable *dtable, u16 *state,
-                                   const u8 *src, i64 *offset) {
+/// Decodes a single FSE symbol and updates the offset
+static inline u8 FSE_decode_symbol(const FSE_dtable *const dtable,
+                                   u16 *const state, const u8 *const src,
+                                   i64 *const offset) {
     const u8 symb = FSE_peek_symbol(dtable, *state);
     FSE_update_state(dtable, state, src, offset);
     return symb;
 }
 
-static inline void FSE_init_state(FSE_dtable *dtable, u16 *state, const u8 *src,
-                                  i64 *offset) {
+static inline void FSE_init_state(const FSE_dtable *const dtable,
+                                  u16 *const state, const u8 *const src,
+                                  i64 *const offset) {
+    // Read in a full `accuracy_log` bits to initialize the state
     const u8 bits = dtable->accuracy_log;
     *state = STREAM_read_bits(src, bits, offset);
 }
 
-static size_t FSE_decompress_interleaved2(FSE_dtable *dtable, u8 *dst,
-                                          size_t dst_len, const u8 *src,
-                                          size_t src_len) {
+static size_t FSE_decompress_interleaved2(const FSE_dtable *const dtable,
+                                          u8 *dst, const size_t dst_len,
+                                          const u8 *const src,
+                                          const size_t src_len) {
     if (src_len == 0) {
         INP_SIZE();
     }
 
-    u8 *dst_max = dst + dst_len;
-    u8 *const odst = dst;
+    const u8 *const dst_max = dst + dst_len;
+    const u8 *const odst = dst;
 
     // Find the last 1 bit
-    int padding = 8 - log2inf(src[src_len - 1]);
+    const int padding = 8 - log2inf(src[src_len - 1]);
 
     i64 offset = src_len * 8 - padding;
 
+    // The end of the stream contains the 2 states, in this order
     u16 state1, state2;
     FSE_init_state(dtable, &state1, src, &offset);
     FSE_init_state(dtable, &state2, src, &offset);
@@ -2002,7 +1941,7 @@ static size_t FSE_decompress_interleaved2(FSE_dtable *dtable, u8 *dst,
         *dst++ = FSE_decode_symbol(dtable, &state1, src, &offset);
         if (offset < 0) {
             // There's still a symbol to decode in state2
-            *dst++ = FSE_decode_symbol(dtable, &state2, src, &offset);
+            *dst++ = FSE_peek_symbol(dtable, state2);
             break;
         }
 
@@ -2012,17 +1951,18 @@ static size_t FSE_decompress_interleaved2(FSE_dtable *dtable, u8 *dst,
         *dst++ = FSE_decode_symbol(dtable, &state2, src, &offset);
         if (offset < 0) {
             // There's still a symbol to decode in state1
-            *dst++ = FSE_decode_symbol(dtable, &state1, src, &offset);
+            *dst++ = FSE_peek_symbol(dtable, state1);
             break;
         }
     }
 
-    // number of symbols read
+    // Number of symbols read
     return dst - odst;
 }
 
-static void FSE_init_dtable(FSE_dtable *dtable, const i16 *norm_freqs,
-                            int num_symbs, int accuracy_log) {
+static void FSE_init_dtable(FSE_dtable *const dtable,
+                            const i16 *const norm_freqs, const int num_symbs,
+                            const int accuracy_log) {
     if (accuracy_log > FSE_MAX_ACCURACY_LOG) {
         ERROR("FSE accuracy too large");
     }
@@ -2032,7 +1972,7 @@ static void FSE_init_dtable(FSE_dtable *dtable, const i16 *norm_freqs,
 
     dtable->accuracy_log = accuracy_log;
 
-    size_t size = (size_t)1 << accuracy_log;
+    const size_t size = (size_t)1 << accuracy_log;
     dtable->symbols = malloc(size * sizeof(u8));
     dtable->num_bits = malloc(size * sizeof(u8));
     dtable->new_state_base = malloc(size * sizeof(u16));
@@ -2057,8 +1997,8 @@ static void FSE_init_dtable(FSE_dtable *dtable, const i16 *norm_freqs,
     }
 
     // Place the rest in the table
-    u16 step = (size >> 1) + (size >> 3) + 3;
-    u16 mask = size - 1;
+    const u16 step = (size >> 1) + (size >> 3) + 3;
+    const u16 mask = size - 1;
     u16 pos = 0;
     for (int s = 0; s < num_symbs; s++) {
         if (norm_freqs[s] <= 0) {
@@ -2068,6 +2008,7 @@ static void FSE_init_dtable(FSE_dtable *dtable, const i16 *norm_freqs,
         state_desc[s] = norm_freqs[s];
 
         for (int i = 0; i < norm_freqs[s]; i++) {
+            // Give `norm_freqs[s]` states to symbol s
             dtable->symbols[pos] = s;
             do {
                 pos = (pos + step) & mask;
@@ -2087,18 +2028,21 @@ static void FSE_init_dtable(FSE_dtable *dtable, const i16 *norm_freqs,
     for (int i = 0; i < size; i++) {
         u8 symbol = dtable->symbols[i];
         u16 next_state_desc = state_desc[symbol]++;
-        // Fills in the table appropriately next_state_desc increases by symbol
+        // Fills in the table appropriately, next_state_desc increases by symbol
         // over time, decreasing number of bits
         dtable->num_bits[i] = (u8)(accuracy_log - log2inf(next_state_desc));
-        // baseline increases until the bit threshold is passed, at which point
+        // Baseline increases until the bit threshold is passed, at which point
         // it resets to 0
         dtable->new_state_base[i] =
             ((u16)next_state_desc << dtable->num_bits[i]) - size;
     }
 }
 
-static size_t FSE_decode_header(FSE_dtable *dtable, const u8 *src,
-                                size_t src_len, int max_accuracy_log) {
+/// Decode an FSE header as defined in the Zstandard format specification and
+/// use the decoded frequencies to initialize a decoding table.
+static size_t FSE_decode_header(FSE_dtable *const dtable, const u8 *const src,
+                                const size_t src_len,
+                                const int max_accuracy_log) {
     if (max_accuracy_log > FSE_MAX_ACCURACY_LOG) {
         ERROR("FSE accuracy too large");
     }
@@ -2106,7 +2050,7 @@ static size_t FSE_decode_header(FSE_dtable *dtable, const u8 *src,
         INP_SIZE();
     }
 
-    int accuracy_log = 5 + read_bits_LE(src, 4, 0);
+    const int accuracy_log = 5 + read_bits_LE(src, 4, 0);
     if (accuracy_log > max_accuracy_log) {
         ERROR("FSE accuracy too large");
     }
@@ -2116,17 +2060,19 @@ static size_t FSE_decode_header(FSE_dtable *dtable, const u8 *src,
     i16 frequencies[FSE_MAX_SYMBS];
 
     int symb = 0;
+    // Offset of 4 because 4 bits were already read in for accuracy_log
     size_t offset = 4;
     while (remaining > 1 && symb < FSE_MAX_SYMBS) {
-        int bits = log2sup(remaining +
-                           1); // the number of possible values we could read
+        // Log of the number of possible values we could read
+        int bits = log2inf(remaining) + 1;
+
         u16 val = read_bits_LE(src, bits, offset);
         offset += bits;
 
-        // try to mask out the lower bits to see if it qualifies for the "small
+        // Try to mask out the lower bits to see if it qualifies for the "small
         // value" threshold
-        u16 lower_mask = ((u16)1 << (bits - 1)) - 1;
-        u16 threshold = ((u16)1 << bits) - 1 - remaining;
+        const u16 lower_mask = ((u16)1 << (bits - 1)) - 1;
+        const u16 threshold = ((u16)1 << bits) - 1 - remaining;
 
         if ((val & lower_mask) < threshold) {
             offset--;
@@ -2135,8 +2081,8 @@ static size_t FSE_decode_header(FSE_dtable *dtable, const u8 *src,
             val = val - threshold;
         }
 
-        i16 proba = (i16)val - 1;
-        // a value of -1 is possible, and has special meaning
+        const i16 proba = (i16)val - 1;
+        // A value of -1 is possible, and has special meaning
         remaining -= proba < 0 ? -proba : proba;
 
         frequencies[symb] = proba;
@@ -2144,7 +2090,7 @@ static size_t FSE_decode_header(FSE_dtable *dtable, const u8 *src,
 
         // Handle the special probability = 0 case
         if (proba == 0) {
-            // read the next two bits to see how many more 0s
+            // Read the next two bits to see how many more 0s
             int repeat = read_bits_LE(src, 2, offset);
             offset += 2;
 
@@ -2172,7 +2118,7 @@ static size_t FSE_decode_header(FSE_dtable *dtable, const u8 *src,
     return (offset + 7) / 8;
 }
 
-static void FSE_init_dtable_rle(FSE_dtable *dtable, u8 symb) {
+static void FSE_init_dtable_rle(FSE_dtable *const dtable, const u8 symb) {
     dtable->symbols = malloc(sizeof(u8));
     dtable->num_bits = malloc(sizeof(u8));
     dtable->new_state_base = malloc(sizeof(u16));
@@ -2189,14 +2135,14 @@ static void FSE_init_dtable_rle(FSE_dtable *dtable, u8 symb) {
     dtable->accuracy_log = 0;
 }
 
-static void FSE_free_dtable(FSE_dtable *dtable) {
+static void FSE_free_dtable(FSE_dtable *const dtable) {
     free(dtable->symbols);
     free(dtable->num_bits);
     free(dtable->new_state_base);
     memset(dtable, 0, sizeof(FSE_dtable));
 }
 
-static void FSE_copy_dtable(FSE_dtable *dst, const FSE_dtable *src) {
+static void FSE_copy_dtable(FSE_dtable *const dst, const FSE_dtable *const src) {
     if (src->accuracy_log == 0) {
         memset(dst, 0, sizeof(FSE_dtable));
         return;
diff --git a/contrib/educational_decoder/zstd_decompress.h b/contrib/educational_decoder/zstd_decompress.h
index 6e173672..16f4da3e 100644
--- a/contrib/educational_decoder/zstd_decompress.h
+++ b/contrib/educational_decoder/zstd_decompress.h
@@ -7,10 +7,10 @@
  * of patent rights can be found in the PATENTS file in the same directory.
  */
 
-size_t ZSTD_decompress(void *dst, size_t dst_len, const void *src,
-                       size_t src_len);
-size_t ZSTD_decompress_with_dict(void *dst, size_t dst_len, const void *src,
-                                 size_t src_len, const void *dict,
-                                 size_t dict_len);
-size_t ZSTD_get_decompressed_size(const void *src, size_t src_len);
+size_t ZSTD_decompress(void *const dst, const size_t dst_len,
+                       const void *const src, const size_t src_len);
+size_t ZSTD_decompress_with_dict(void *const dst, const size_t dst_len,
+                                 const void *const src, const size_t src_len,
+                                 const void *const dict, const size_t dict_len);
+size_t ZSTD_get_decompressed_size(const void *const src, const size_t src_len);
 

From 823d8c233bd7a12aefee349d7556855ea3894487 Mon Sep 17 00:00:00 2001
From: Sean Purcell <me@seanp.xyz>
Date: Wed, 1 Feb 2017 10:41:04 -0800
Subject: [PATCH 06/15] Minor security fixes

---
 contrib/educational_decoder/harness.c         | 2 +-
 contrib/educational_decoder/zstd_decompress.c | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/contrib/educational_decoder/harness.c b/contrib/educational_decoder/harness.c
index 107a16a2..cff8239d 100644
--- a/contrib/educational_decoder/harness.c
+++ b/contrib/educational_decoder/harness.c
@@ -98,7 +98,7 @@ int main(int argc, char **argv) {
     }
 
     size_t decompressed =
-        ZSTD_decompress_with_dict(output, input_size * MAX_COMPRESSION_RATIO,
+        ZSTD_decompress_with_dict(output, decompressed_size,
                                   input, input_size, dict, dict_size);
 
     write_file(argv[2], output, decompressed);
diff --git a/contrib/educational_decoder/zstd_decompress.c b/contrib/educational_decoder/zstd_decompress.c
index 3c1c5673..e2fbcf2c 100644
--- a/contrib/educational_decoder/zstd_decompress.c
+++ b/contrib/educational_decoder/zstd_decompress.c
@@ -1331,6 +1331,8 @@ static void execute_sequences(io_streams_t *const streams,
                 }
                 match_length -= dict_copy;
             }
+        } else if (offset > ctx->header.window_size) {
+            CORRUPTION();
         }
 
         // We must copy byte by byte because the match length might be larger

From 18ce8b54ddeb7cd80de8978d7fb0b66b966089d7 Mon Sep 17 00:00:00 2001
From: Sean Purcell <me@seanp.xyz>
Date: Wed, 1 Feb 2017 17:05:45 -0800
Subject: [PATCH 07/15] Switch IO to go through streams

---
 contrib/educational_decoder/harness.c         |   15 +-
 contrib/educational_decoder/zstd_decompress.c | 1263 ++++++++---------
 2 files changed, 604 insertions(+), 674 deletions(-)

diff --git a/contrib/educational_decoder/harness.c b/contrib/educational_decoder/harness.c
index cff8239d..683278df 100644
--- a/contrib/educational_decoder/harness.c
+++ b/contrib/educational_decoder/harness.c
@@ -18,6 +18,9 @@ typedef unsigned char u8;
 // compression ratio is at most 16
 #define MAX_COMPRESSION_RATIO (16)
 
+// Protect against allocating too much memory for output
+#define MAX_OUTPUT_SIZE ((size_t)1024 * 1024 * 1024)
+
 u8 *input;
 u8 *output;
 u8 *dict;
@@ -86,11 +89,17 @@ int main(int argc, char **argv) {
     size_t decompressed_size = ZSTD_get_decompressed_size(input, input_size);
     if (decompressed_size == -1) {
         decompressed_size = MAX_COMPRESSION_RATIO * input_size;
-        fprintf(stderr, "WARNING: Compressed data does contain decompressed "
-                        "size, going to assume the compression ratio is at "
-                        "most %d (decompressed size of at most %zu)\n",
+        fprintf(stderr, "WARNING: Compressed data does not contain "
+                        "decompressed size, going to assume the compression "
+                        "ratio is at most %d (decompressed size of at most "
+                        "%zu)\n",
                 MAX_COMPRESSION_RATIO, decompressed_size);
     }
+    if (decompressed_size > MAX_OUTPUT_SIZE) {
+        fprintf(stderr,
+                "Required output size too large for this implementation\n");
+        return 1;
+    }
     output = malloc(decompressed_size);
     if (!output) {
         fprintf(stderr, "failed to allocate memory\n");
diff --git a/contrib/educational_decoder/zstd_decompress.c b/contrib/educational_decoder/zstd_decompress.c
index e2fbcf2c..8f28313e 100644
--- a/contrib/educational_decoder/zstd_decompress.c
+++ b/contrib/educational_decoder/zstd_decompress.c
@@ -48,6 +48,7 @@ size_t ZSTD_get_decompressed_size(const void *const src, const size_t src_len);
 #define OUT_SIZE() ERROR("Output buffer too small for output")
 #define CORRUPTION() ERROR("Corruption detected while decompressing")
 #define BAD_ALLOC() ERROR("Memory allocation error")
+#define IMPOSSIBLE() ERROR("An impossibility has occurred")
 
 typedef uint8_t u8;
 typedef uint16_t u16;
@@ -65,6 +66,62 @@ typedef int64_t i64;
 /// file.  They implement low-level functionality needed for the higher level
 /// decompression functions.
 
+/*** IO STREAM OPERATIONS *************/
+/// These structs are the interface for IO, and do bounds checking on all
+/// operations.  They should be used opaquely to ensure safety.
+
+/// Output is always done byte-by-byte
+typedef struct {
+    u8 *ptr;
+    size_t len;
+} ostream_t;
+
+/// Input often reads a few bits at a time, so maintain an internal offset
+typedef struct {
+    const u8 *ptr;
+    int bit_offset;
+    size_t len;
+} istream_t;
+
+/// The following two functions are the only ones that allow the istream to be
+/// non-byte aligned
+
+/// Reads `num` bits from a bitstream, and updates the internal offset
+static inline u64 IO_read_bits(istream_t *const in, const int num);
+/// Rewinds the stream by `num` bits
+static inline void IO_rewind_bits(istream_t *const in, const int num);
+/// If the remaining bits in a byte will be unused, advance to the end of the
+/// byte
+static inline void IO_align_stream(istream_t *const in);
+
+/// Write the given byte into the output stream
+static inline void IO_write_byte(ostream_t *const out, u8 symb);
+
+/// Returns the number of bytes left to be read in this stream.  The stream must
+/// be byte aligned.
+static inline size_t IO_istream_len(const istream_t *const in);
+
+/// Returns a pointer where `len` bytes can be read, and advances the internal
+/// state.  The stream must be byte aligned.
+static inline const u8 *IO_read_bytes(istream_t *const in, size_t len);
+/// Returns a pointer where `len` bytes can be written, and advances the internal
+/// state.  The stream must be byte aligned.
+static inline u8 *IO_write_bytes(ostream_t *const out, size_t len);
+
+/// Advance the inner state by `len` bytes.  The stream must be byte aligned.
+static inline void IO_advance_input(istream_t *const in, size_t len);
+
+/// Returns an `ostream_t` constructed from the given pointer and length
+static inline ostream_t IO_make_ostream(u8 *out, size_t len);
+/// Returns an `istream_t` constructed from the given pointer and length
+static inline istream_t IO_make_istream(const u8 *in, size_t len);
+
+/// Returns an `istream_t` with the same base as `in`, and length `len`
+/// Then, advance `in` to account for the consumed bytes
+/// `in` must be byte aligned
+static inline istream_t IO_make_sub_istream(istream_t *const in, size_t len);
+/*** END IO STREAM OPERATIONS *********/
+
 /*** BITSTREAM OPERATIONS *************/
 /// Read `num` bits (up to 64) from `src + offset`, where `offset` is in bits
 static inline u64 read_bits_LE(const u8 *src, const int num,
@@ -109,15 +166,13 @@ static inline void HUF_init_state(const HUF_dtable *const dtable,
 
 /// Decompresses a single Huffman stream, returns the number of bytes decoded.
 /// `src_len` must be the exact length of the Huffman-coded block.
-static size_t HUF_decompress_1stream(const HUF_dtable *const dtable, u8 *dst,
-                                     const size_t dst_len, const u8 *src,
-                                     size_t src_len);
+static size_t HUF_decompress_1stream(const HUF_dtable *const dtable,
+                                     ostream_t *const out, istream_t *const in);
 /// Same as previous but decodes 4 streams, formatted as in the Zstandard
 /// specification.
 /// `src_len` must be the exact length of the Huffman-coded block.
-static size_t HUF_decompress_4stream(const HUF_dtable *const dtable, u8 *dst,
-                                     const size_t dst_len, const u8 *const src,
-                                     const size_t src_len);
+static size_t HUF_decompress_4stream(const HUF_dtable *const dtable,
+                                     ostream_t *const out, istream_t *const in);
 
 /// Initialize a Huffman decoding table using the table of bit counts provided
 static void HUF_init_dtable(HUF_dtable *const table, const u8 *const bits,
@@ -176,9 +231,8 @@ static inline void FSE_init_state(const FSE_dtable *const dtable,
 /// using an FSE decoding table.  `src_len` must be the exact length of the
 /// block.
 static size_t FSE_decompress_interleaved2(const FSE_dtable *const dtable,
-                                          u8 *dst, const size_t dst_len,
-                                          const u8 *const src,
-                                          const size_t src_len);
+                                          ostream_t *const out,
+                                          istream_t *const in);
 
 /// Initialize a decoding table using normalized frequencies.
 static void FSE_init_dtable(FSE_dtable *const dtable,
@@ -187,8 +241,7 @@ static void FSE_init_dtable(FSE_dtable *const dtable,
 
 /// Decode an FSE header as defined in the Zstandard format specification and
 /// use the decoded frequencies to initialize a decoding table.
-static size_t FSE_decode_header(FSE_dtable *const dtable, const u8 *const src,
-                                const size_t src_len,
+static void FSE_decode_header(FSE_dtable *const dtable, istream_t *const in,
                                 const int max_accuracy_log);
 
 /// Initialize an FSE table that will always return the same symbol and consume
@@ -207,16 +260,6 @@ static void FSE_copy_dtable(FSE_dtable *const dst, const FSE_dtable *const src);
 
 /******* ZSTD HELPER STRUCTS AND PROTOTYPES ***********************************/
 
-/// Input and output pointers to allow them to be advanced by
-/// functions that consume input/produce output
-typedef struct {
-    u8 *dst;
-    size_t dst_len;
-
-    const u8 *src;
-    size_t src_len;
-} io_streams_t;
-
 /// A small structure that can be reused in various places that need to access
 /// frame header information
 typedef struct {
@@ -233,9 +276,6 @@ typedef struct {
     int content_checksum_flag;
     // Whether or not the output for this frame is in a single segment
     int single_segment_flag;
-
-    // The size in bytes of this header
-    int header_size;
 } frame_header_t;
 
 /// The context needed to decode blocks in a frame
@@ -256,9 +296,8 @@ typedef struct {
     FSE_dtable ml_dtable;
     FSE_dtable of_dtable;
 
-    // The last 3 offsets for the special "repeat offsets".  Array size is 4 so
-    // that previous_offsets[1] corresponds to the most recent offset
-    u64 previous_offsets[4];
+    // The last 3 offsets for the special "repeat offsets".
+    u64 previous_offsets[3];
 } frame_context_t;
 
 /// The decoded contents of a dictionary so that it doesn't have to be repeated
@@ -275,7 +314,7 @@ typedef struct {
     size_t content_size;
 
     // Offset history to prepopulate the frame's history
-    u64 previous_offsets[4];
+    u64 previous_offsets[3];
 
     u32 dictionary_id;
 } dictionary_t;
@@ -301,34 +340,31 @@ typedef struct {
 /// Accepts a dict argument, which may be NULL indicating no dictionary.
 /// See
 /// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame-concatenation
-static void decode_frame(io_streams_t *const streams,
+static void decode_frame(ostream_t *const out, istream_t *const in,
                          const dictionary_t *const dict);
 
 // Decode data in a compressed block
-static void decompress_block(io_streams_t *const streams,
-                             frame_context_t *const ctx,
-                             const size_t block_len);
+static void decompress_block(frame_context_t *const ctx, ostream_t *const out,
+                             istream_t *const in);
 
 // Decode the literals section of a block
-static size_t decode_literals(io_streams_t *const streams,
-                              frame_context_t *const ctx, u8 **const literals);
+static size_t decode_literals(frame_context_t *const ctx, istream_t *const in,
+                              u8 **const literals);
 
 // Decode the sequences part of a block
-static size_t decode_sequences(frame_context_t *const ctx, const u8 *const src,
-                               const size_t src_len,
+static size_t decode_sequences(frame_context_t *const ctx, istream_t *const in,
                                sequence_command_t **const sequences);
 
 // Execute the decoded sequences on the literals block
-static void execute_sequences(io_streams_t *const streams,
-                              frame_context_t *const ctx,
+static void execute_sequences(frame_context_t *const ctx, ostream_t *const out,
+                              const u8 *const literals,
+                              const size_t literals_len,
                               const sequence_command_t *const sequences,
-                              const size_t num_sequences,
-                              const u8 *literals,
-                              size_t literals_len);
+                              const size_t num_sequences);
 
 // Parse a provided dictionary blob for use in decompression
-static void parse_dictionary(dictionary_t *const dict, const u8 *const src,
-                             const size_t src_len);
+static void parse_dictionary(dictionary_t *const dict, const u8 *src,
+                             size_t src_len);
 static void free_dictionary(dictionary_t *const dict);
 /******* END ZSTD HELPER STRUCTS AND PROTOTYPES *******************************/
 
@@ -348,58 +384,46 @@ size_t ZSTD_decompress_with_dict(void *const dst, const size_t dst_len,
         parse_dictionary(&parsed_dict, (const u8 *)dict, dict_len);
     }
 
-    io_streams_t streams = {(u8 *)dst, dst_len, (const u8 *)src, src_len};
-    while (streams.src_len > 0) {
-        decode_frame(&streams, &parsed_dict);
+    istream_t in = {(const u8 *)src, 0, src_len};
+    ostream_t out = {(u8 *)dst, dst_len};
+    while (IO_istream_len(&in) > 0) {
+        decode_frame(&out, &in, &parsed_dict);
     }
 
     free_dictionary(&parsed_dict);
 
-    return streams.dst - (u8 *)dst;
+    return out.ptr - (u8 *)dst;
 }
 
 /******* FRAME DECODING ******************************************************/
 
-static void decode_data_frame(io_streams_t *const streams,
+static void decode_data_frame(ostream_t *const out, istream_t *const in,
                               const dictionary_t *const dict);
-static void init_frame_context(io_streams_t *const streams,
-                               frame_context_t *const context,
+static void init_frame_context(frame_context_t *const context,
+                               istream_t *const in,
                                const dictionary_t *const dict);
 static void free_frame_context(frame_context_t *const context);
 static void parse_frame_header(frame_header_t *const header,
-                               const u8 *const src, const size_t src_len);
+                               istream_t *const in);
 static void frame_context_apply_dict(frame_context_t *const ctx,
                                      const dictionary_t *const dict);
 
-static void decompress_data(io_streams_t *const streams,
-                            frame_context_t *const ctx);
+static void decompress_data(frame_context_t *const ctx, ostream_t *const out,
+                            istream_t *const in);
 
-static void decode_frame(io_streams_t *const streams,
+static void decode_frame(ostream_t *const out, istream_t *const in,
                          const dictionary_t *const dict) {
-    if (streams->src_len < 4) {
-        INP_SIZE();
-    }
-    const u32 magic_number = read_bits_LE(streams->src, 32, 0);
+    const u32 magic_number = IO_read_bits(in, 32);
 
-    streams->src += 4;
-    streams->src_len -= 4;
-    if (magic_number >= 0x184D2A50U && magic_number <= 0x184D2A5F) {
-        // skippable frame
-        if (streams->src_len < 4) {
-            INP_SIZE();
-        }
-        const size_t frame_size = read_bits_LE(streams->src, 32, 32);
-
-        if (streams->src_len < 4 + frame_size) {
-            INP_SIZE();
-        }
+    if ((magic_number & ~0xFU) == 0x184D2A50U) {
+        // Skippable frame
+        const size_t frame_size = IO_read_bits(in, 32);
 
         // skip over frame
-        streams->src += 4 + frame_size;
-        streams->src_len -= 4 + frame_size;
+        IO_advance_input(in, frame_size);
     } else if (magic_number == 0xFD2FB528U) {
         // ZSTD frame
-        decode_data_frame(streams, dict);
+        decode_data_frame(out, in, dict);
     } else {
         // not a real frame
         ERROR("Invalid magic number");
@@ -410,40 +434,38 @@ static void decode_frame(io_streams_t *const streams,
 /// are skippable frames.
 /// See
 /// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#general-structure-of-zstandard-frame-format
-static void decode_data_frame(io_streams_t *const streams,
+static void decode_data_frame(ostream_t *const out, istream_t *const in,
                               const dictionary_t *const dict) {
     frame_context_t ctx;
 
     // Initialize the context that needs to be carried from block to block
-    init_frame_context(streams, &ctx, dict);
+    init_frame_context(&ctx, in, dict);
 
     if (ctx.header.frame_content_size != 0 &&
-        ctx.header.frame_content_size > streams->dst_len) {
+        ctx.header.frame_content_size > out->len) {
         OUT_SIZE();
     }
 
-    decompress_data(streams, &ctx);
+    decompress_data(&ctx, out, in);
 
     free_frame_context(&ctx);
 }
 
 /// Takes the information provided in the header and dictionary, and initializes
 /// the context for this frame
-static void init_frame_context(io_streams_t *const streams,
-                               frame_context_t *const context,
+static void init_frame_context(frame_context_t *const context,
+                               istream_t *const in,
                                const dictionary_t *const dict) {
     // Most fields in context are correct when initialized to 0
-    memset(context, 0x00, sizeof(frame_context_t));
+    memset(context, 0, sizeof(frame_context_t));
 
     // Parse data from the frame header
-    parse_frame_header(&context->header, streams->src, streams->src_len);
-    streams->src += context->header.header_size;
-    streams->src_len -= context->header.header_size;
+    parse_frame_header(&context->header, in);
 
     // Set up the offset history for the repeat offset commands
-    context->previous_offsets[1] = 1;
-    context->previous_offsets[2] = 4;
-    context->previous_offsets[3] = 8;
+    context->previous_offsets[0] = 1;
+    context->previous_offsets[1] = 4;
+    context->previous_offsets[2] = 8;
 
     // Apply details from the dict if it exists
     frame_context_apply_dict(context, dict);
@@ -460,12 +482,8 @@ static void free_frame_context(frame_context_t *const context) {
 }
 
 static void parse_frame_header(frame_header_t *const header,
-                               const u8 *const src, const size_t src_len) {
-    if (src_len < 1) {
-        INP_SIZE();
-    }
-
-    const u8 descriptor = read_bits_LE(src, 8, 0);
+                               istream_t *const in) {
+    const u8 descriptor = IO_read_bits(in, 8);
 
     // decode frame header descriptor into flags
     const u8 frame_content_size_flag = descriptor >> 6;
@@ -478,28 +496,20 @@ static void parse_frame_header(frame_header_t *const header,
         CORRUPTION();
     }
 
-    int header_size = 1;
-
     header->single_segment_flag = single_segment_flag;
     header->content_checksum_flag = content_checksum_flag;
 
     // decode window size
     if (!single_segment_flag) {
-        if (src_len < header_size + 1) {
-            INP_SIZE();
-        }
-
         // Use the algorithm from the specification to compute window size
         // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor
-        u8 window_descriptor = src[header_size];
+        u8 window_descriptor = IO_read_bits(in, 8);
         u8 exponent = window_descriptor >> 3;
         u8 mantissa = window_descriptor & 7;
 
         size_t window_base = (size_t)1 << (10 + exponent);
         size_t window_add = (window_base / 8) * mantissa;
         header->window_size = window_base + window_add;
-
-        header_size++;
     }
 
     // decode dictionary id if it exists
@@ -507,13 +517,7 @@ static void parse_frame_header(frame_header_t *const header,
         const int bytes_array[] = {0, 1, 2, 4};
         const int bytes = bytes_array[dictionary_id_flag];
 
-        if (src_len < header_size + bytes) {
-            INP_SIZE();
-        }
-
-        header->dictionary_id = read_bits_LE(src + header_size, bytes * 8, 0);
-
-        header_size += bytes;
+        header->dictionary_id = IO_read_bits(in, bytes * 8);
     } else {
         header->dictionary_id = 0;
     }
@@ -525,17 +529,10 @@ static void parse_frame_header(frame_header_t *const header,
         const int bytes_array[] = {1, 2, 4, 8};
         const int bytes = bytes_array[frame_content_size_flag];
 
-        if (src_len < header_size + bytes) {
-            INP_SIZE();
-        }
-
-        header->frame_content_size =
-            read_bits_LE(src + header_size, bytes * 8, 0);
+        header->frame_content_size = IO_read_bits(in, bytes * 8);
         if (bytes == 2) {
             header->frame_content_size += 256;
         }
-
-        header_size += bytes;
     } else {
         header->frame_content_size = 0;
     }
@@ -546,8 +543,6 @@ static void parse_frame_header(frame_header_t *const header,
         // back to the dictionary or not on large offsets
         header->window_size = header->frame_content_size;
     }
-
-    header->header_size = header_size;
 }
 
 /// A dictionary acts as initializing values for the frame context before
@@ -559,20 +554,15 @@ static void frame_context_apply_dict(frame_context_t *const ctx,
     if (!dict || !dict->content)
         return;
 
-    if (ctx->header.dictionary_id == 0 && dict->dictionary_id != 0) {
-        // The dictionary is unneeded, and shouldn't be used as it may interfere
-        // with the default offset history
-        return;
-    }
-
-    // If the dictionary id is 0, it doesn't matter if we provide the wrong raw
-    // content dict, it won't change anything
+    // If the requested dictionary_id is non-zero, the correct dictionary must
+    // be present
     if (ctx->header.dictionary_id != 0 &&
         ctx->header.dictionary_id != dict->dictionary_id) {
-        ERROR("Wrong/no dictionary provided");
+        ERROR("Wrong dictionary provided");
     }
 
-    // Copy the pointer in so we can reference it in sequence execution
+    // Copy the dict content to the context for references during sequence
+    // execution
     ctx->dict_content = dict->content;
     ctx->dict_content_len = dict->content_size;
 
@@ -592,188 +582,137 @@ static void frame_context_apply_dict(frame_context_t *const ctx,
 }
 
 /// Decompress the data from a frame block by block
-static void decompress_data(io_streams_t *const streams,
-                            frame_context_t *const ctx) {
+static void decompress_data(frame_context_t *const ctx, ostream_t *const out,
+                            istream_t *const in) {
     int last_block = 0;
     do {
-        if (streams->src_len < 3) {
-            INP_SIZE();
-        }
         // Parse the block header
-        last_block = streams->src[0] & 1;
-        const int block_type = (streams->src[0] >> 1) & 3;
-        const size_t block_len = read_bits_LE(streams->src, 21, 3);
-
-        streams->src += 3;
-        streams->src_len -= 3;
+        last_block = IO_read_bits(in, 1);
+        const int block_type = IO_read_bits(in, 2);
+        const size_t block_len = IO_read_bits(in, 21);
 
         switch (block_type) {
         case 0: {
             // Raw, uncompressed block
-            if (streams->src_len < block_len) {
-                INP_SIZE();
-            }
-            if (streams->dst_len < block_len) {
-                OUT_SIZE();
-            }
-
+            const u8 *const read_ptr = IO_read_bytes(in, block_len);
+            u8 *const write_ptr = IO_write_bytes(out, block_len);
+            //
             // Copy the raw data into the output
-            memcpy(streams->dst, streams->src, block_len);
-
-            streams->src += block_len;
-            streams->src_len -= block_len;
-
-            streams->dst += block_len;
-            streams->dst_len -= block_len;
+            memcpy(write_ptr, read_ptr, block_len);
 
             ctx->current_total_output += block_len;
             break;
         }
         case 1: {
             // RLE block, repeat the first byte N times
-            if (streams->src_len < 1) {
-                INP_SIZE();
-            }
-            if (streams->dst_len < block_len) {
-                OUT_SIZE();
-            }
+            const u8 *const read_ptr = IO_read_bytes(in, 1);
+            u8 *const write_ptr = IO_write_bytes(out, block_len);
 
             // Copy `block_len` copies of `streams->src[0]` to the output
-            memset(streams->dst, streams->src[0], block_len);
-
-            streams->dst += block_len;
-            streams->dst_len -= block_len;
-
-            streams->src += 1;
-            streams->src_len -= 1;
+            memset(write_ptr, read_ptr[0], block_len);
 
             ctx->current_total_output += block_len;
             break;
         }
-        case 2:
-            // Compressed block, this is mode complex
-            decompress_block(streams, ctx, block_len);
+        case 2: {
+            // Compressed block
+            // Create a sub-stream for the block
+            istream_t block_stream = IO_make_sub_istream(in, block_len);
+            decompress_block(ctx, out, &block_stream);
             break;
+        }
         case 3:
             // Reserved block type
             CORRUPTION();
             break;
+        default:
+            IMPOSSIBLE();
         }
     } while (!last_block);
 
     if (ctx->header.content_checksum_flag) {
         // This program does not support checking the checksum, so skip over it
         // if it's present
-        if (streams->src_len < 4) {
-            INP_SIZE();
-        }
-        streams->src += 4;
-        streams->src_len -= 4;
+        IO_advance_input(in, 4);
     }
 }
 /******* END FRAME DECODING ***************************************************/
 
 /******* BLOCK DECOMPRESSION **************************************************/
-static void decompress_block(io_streams_t *const streams, frame_context_t *const ctx,
-                             const size_t block_len) {
-    if (streams->src_len < block_len) {
-        INP_SIZE();
-    }
-    // We need this to determine how long the compressed literals block was
-    const u8 *const end_of_block = streams->src + block_len;
-
+static void decompress_block(frame_context_t *const ctx, ostream_t *const out,
+                             istream_t *const in) {
     // Part 1: decode the literals block
     u8 *literals = NULL;
-    const size_t literals_size = decode_literals(streams, ctx, &literals);
+    const size_t literals_size = decode_literals(ctx, in, &literals);
 
     // Part 2: decode the sequences block
-    if (streams->src > end_of_block) {
-        INP_SIZE();
-    }
-    const size_t sequences_size = end_of_block - streams->src;
     sequence_command_t *sequences = NULL;
     const size_t num_sequences =
-        decode_sequences(ctx, streams->src, sequences_size, &sequences);
-
-    streams->src += sequences_size;
-    streams->src_len -= sequences_size;
+        decode_sequences(ctx, in, &sequences);
 
     // Part 3: combine literals and sequence commands to generate output
-    execute_sequences(streams, ctx, sequences, num_sequences, literals,
-                      literals_size);
+    execute_sequences(ctx, out, literals, literals_size, sequences,
+                      num_sequences);
     free(literals);
     free(sequences);
 }
 /******* END BLOCK DECOMPRESSION **********************************************/
 
 /******* LITERALS DECODING ****************************************************/
-static size_t decode_literals_simple(io_streams_t *const streams,
-                                     u8 **const literals, const int block_type,
+static size_t decode_literals_simple(istream_t *const in, u8 **const literals,
+                                     const int block_type,
                                      const int size_format);
-static size_t decode_literals_compressed(io_streams_t *const streams,
-                                         frame_context_t *const ctx,
+static size_t decode_literals_compressed(frame_context_t *const ctx,
+                                         istream_t *const in,
                                          u8 **const literals,
                                          const int block_type,
                                          const int size_format);
-static size_t decode_huf_table(const u8 *src, size_t src_len,
-                               HUF_dtable *const dtable);
-static size_t fse_decode_hufweights(const u8 *const src, const size_t src_len,
-                                    u8 *const weights, int *const num_symbs,
-                                    const size_t compressed_size);
+static void decode_huf_table(istream_t *const in, HUF_dtable *const dtable);
+static void fse_decode_hufweights(ostream_t *weights, istream_t *const in,
+                                    int *const num_symbs);
 
-static size_t decode_literals(io_streams_t *const streams,
-                              frame_context_t *const ctx, u8 **const literals) {
-    if (streams->src_len < 1) {
-        INP_SIZE();
-    }
+static size_t decode_literals(frame_context_t *const ctx, istream_t *const in,
+                              u8 **const literals) {
     // Decode literals header
-    int block_type = streams->src[0] & 3;
-    int size_format = (streams->src[0] >> 2) & 3;
+    int block_type = IO_read_bits(in, 2);
+    int size_format = IO_read_bits(in, 2);
 
     if (block_type <= 1) {
         // Raw or RLE literals block
-        return decode_literals_simple(streams, literals, block_type,
+        return decode_literals_simple(in, literals, block_type,
                                       size_format);
     } else {
         // Huffman compressed literals
-        return decode_literals_compressed(streams, ctx, literals, block_type,
+        return decode_literals_compressed(ctx, in, literals, block_type,
                                           size_format);
     }
 }
 
 /// Decodes literals blocks in raw or RLE form
-static size_t decode_literals_simple(io_streams_t *const streams,
-                                     u8 **const literals, const int block_type,
+static size_t decode_literals_simple(istream_t *const in, u8 **const literals,
+                                     const int block_type,
                                      const int size_format) {
     size_t size;
     switch (size_format) {
-    // These cases are in the form X0
-    // In this case, the X bit is actually part of the size field
+    // These cases are in the form ?0
+    // In this case, the ? bit is actually part of the size field
     case 0:
     case 2:
-        size = read_bits_LE(streams->src, 5, 3);
-        streams->src += 1;
-        streams->src_len -= 1;
+        // "Size_Format uses 1 bit. Regenerated_Size uses 5 bits (0-31)."
+        IO_rewind_bits(in, 1);
+        size = IO_read_bits(in, 5);
         break;
     case 1:
-        if (streams->src_len < 2) {
-            INP_SIZE();
-        }
-        size = read_bits_LE(streams->src, 12, 4);
-        streams->src += 2;
-        streams->src_len -= 2;
+        // "Size_Format uses 2 bits. Regenerated_Size uses 12 bits (0-4095)."
+        size = IO_read_bits(in, 12);
         break;
     case 3:
-        if (streams->src_len < 2) {
-            INP_SIZE();
-        }
-        size = read_bits_LE(streams->src, 20, 4);
-        streams->src += 3;
-        streams->src_len -= 3;
+        // "Size_Format uses 2 bits. Regenerated_Size uses 20 bits (0-1048575)."
+        size = IO_read_bits(in, 20);
         break;
     default:
-        // Impossible
-        size = -1;
+        // Size format is in range 0-3
+        IMPOSSIBLE();
     }
 
     if (size > MAX_LITERALS_SIZE) {
@@ -786,32 +725,28 @@ static size_t decode_literals_simple(io_streams_t *const streams,
     }
 
     switch (block_type) {
-    case 0:
+    case 0: {
         // Raw data
-        if (size > streams->src_len) {
-            INP_SIZE();
-        }
-        memcpy(*literals, streams->src, size);
-        streams->src += size;
-        streams->src_len -= size;
+        const u8 *const read_ptr = IO_read_bytes(in, size);
+        memcpy(*literals, read_ptr, size);
         break;
-    case 1:
+    }
+    case 1: {
         // Single repeated byte
-        if (1 > streams->src_len) {
-            INP_SIZE();
-        }
-        memset(*literals, streams->src[0], size);
-        streams->src += 1;
-        streams->src_len -= 1;
+        const u8 *const read_ptr = IO_read_bytes(in, 1);
+        memset(*literals, read_ptr[0], size);
         break;
     }
+    default:
+        IMPOSSIBLE();
+    }
 
     return size;
 }
 
 /// Decodes Huffman compressed literals
-static size_t decode_literals_compressed(io_streams_t *const streams,
-                                         frame_context_t *const ctx,
+static size_t decode_literals_compressed(frame_context_t *const ctx,
+                                         istream_t *const in,
                                          u8 **const literals,
                                          const int block_type,
                                          const int size_format) {
@@ -820,98 +755,78 @@ static size_t decode_literals_compressed(io_streams_t *const streams,
     int num_streams = 4;
     switch (size_format) {
     case 0:
+        // "A single stream. Both Compressed_Size and Regenerated_Size use 10
+        // bits (0-1023)."
         num_streams = 1;
     // Fall through as it has the same size format
     case 1:
-        if (streams->src_len < 3) {
-            INP_SIZE();
-        }
-        regenerated_size = read_bits_LE(streams->src, 10, 4);
-        compressed_size = read_bits_LE(streams->src, 10, 14);
-        streams->src += 3;
-        streams->src_len -= 3;
+        // "4 streams. Both Compressed_Size and Regenerated_Size use 10 bits
+        // (0-1023)."
+        regenerated_size = IO_read_bits(in, 10);
+        compressed_size = IO_read_bits(in, 10);
         break;
     case 2:
-        if (streams->src_len < 4) {
-            INP_SIZE();
-        }
-        regenerated_size = read_bits_LE(streams->src, 14, 4);
-        compressed_size = read_bits_LE(streams->src, 14, 18);
-        streams->src += 4;
-        streams->src_len -= 4;
+        // "4 streams. Both Compressed_Size and Regenerated_Size use 14 bits
+        // (0-16383)."
+        regenerated_size = IO_read_bits(in, 14);
+        compressed_size = IO_read_bits(in, 14);
         break;
     case 3:
-        if (streams->src_len < 5) {
-            INP_SIZE();
-        }
-        regenerated_size = read_bits_LE(streams->src, 18, 4);
-        compressed_size = read_bits_LE(streams->src, 18, 22);
-        streams->src += 5;
-        streams->src_len -= 5;
+        // "4 streams. Both Compressed_Size and Regenerated_Size use 18 bits
+        // (0-262143)."
+        regenerated_size = IO_read_bits(in, 18);
+        compressed_size = IO_read_bits(in, 18);
         break;
     default:
         // Impossible
-        compressed_size = regenerated_size = -1;
+        IMPOSSIBLE();
     }
     if (regenerated_size > MAX_LITERALS_SIZE ||
         compressed_size > regenerated_size) {
         CORRUPTION();
     }
 
-    if (compressed_size > streams->src_len) {
-        INP_SIZE();
-    }
-
     *literals = malloc(regenerated_size);
     if (!*literals) {
         BAD_ALLOC();
     }
 
+    ostream_t lit_stream = IO_make_ostream(*literals, regenerated_size);
+    istream_t huf_stream = IO_make_sub_istream(in, compressed_size);
+
     if (block_type == 2) {
         // Decode provided Huffman table
 
         HUF_free_dtable(&ctx->literals_dtable);
-        const size_t size = decode_huf_table(streams->src, compressed_size,
-                                             &ctx->literals_dtable);
-        streams->src += size;
-        streams->src_len -= size;
-        compressed_size -= size;
+        decode_huf_table(&huf_stream, &ctx->literals_dtable);
     } else {
-        // If we're to repeat the previous Huffman table, make sure it exists
+        // If the previous Huffman table is being repeated, ensure it exists
         if (!ctx->literals_dtable.symbols) {
             CORRUPTION();
         }
     }
 
+    size_t symbols_decoded;
     if (num_streams == 1) {
-        HUF_decompress_1stream(&ctx->literals_dtable, *literals,
-                               regenerated_size, streams->src, compressed_size);
+        symbols_decoded = HUF_decompress_1stream(&ctx->literals_dtable, &lit_stream, &huf_stream);
     } else {
-        HUF_decompress_4stream(&ctx->literals_dtable, *literals,
-                               regenerated_size, streams->src, compressed_size);
+        symbols_decoded = HUF_decompress_4stream(&ctx->literals_dtable, &lit_stream, &huf_stream);
+    }
+
+    if (symbols_decoded != regenerated_size) {
+        CORRUPTION();
     }
-    streams->src += compressed_size;
-    streams->src_len -= compressed_size;
 
     return regenerated_size;
 }
 
 // Decode the Huffman table description
-static size_t decode_huf_table(const u8 *src, size_t src_len,
-                               HUF_dtable *const dtable) {
-    if (src_len < 1) {
-        INP_SIZE();
-    }
+static void decode_huf_table(istream_t *const in, HUF_dtable *const dtable) {
+    const u8 header = IO_read_bits(in, 8);
 
-    const u8 *const osrc = src;
-
-    const u8 header = src[0];
     u8 weights[HUF_MAX_SYMBS];
     memset(weights, 0, sizeof(weights));
 
-    src++;
-    src_len--;
-
     int num_symbs;
 
     if (header >= 128) {
@@ -919,67 +834,56 @@ static size_t decode_huf_table(const u8 *src, size_t src_len,
         num_symbs = header - 127;
         const size_t bytes = (num_symbs + 1) / 2;
 
-        if (bytes > src_len) {
-            INP_SIZE();
-        }
+        const u8 *const weight_src = IO_read_bytes(in, bytes);
 
         for (int i = 0; i < num_symbs; i++) {
             // read_bits_LE isn't applicable here because the weights are order
             // reversed within each byte
             // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#huffman-tree-header
             if (i % 2 == 0) {
-                weights[i] = src[i / 2] >> 4;
+                weights[i] = weight_src[i / 2] >> 4;
             } else {
-                weights[i] = src[i / 2] & 0xf;
+                weights[i] = weight_src[i / 2] & 0xf;
             }
         }
-
-        src += bytes;
-        src_len -= bytes;
     } else {
         // The weights are FSE encoded, decode them before we can construct the
         // table
-        const size_t size =
-            fse_decode_hufweights(src, src_len, weights, &num_symbs, header);
-        src += size;
-        src_len -= size;
+        istream_t fse_stream = IO_make_sub_istream(in, header);
+        ostream_t weight_stream = IO_make_ostream(weights, HUF_MAX_SYMBS);
+        fse_decode_hufweights(&weight_stream, &fse_stream, &num_symbs);
     }
 
     // Construct the table using the decoded weights
     HUF_init_dtable_usingweights(dtable, weights, num_symbs);
-    return src - osrc;
 }
 
-static size_t fse_decode_hufweights(const u8 *const src, const size_t src_len,
-                                    u8 *const weights, int *const num_symbs,
-                                    const size_t compressed_size) {
+static void fse_decode_hufweights(ostream_t *weights, istream_t *const in,
+                                    int *const num_symbs) {
     const int MAX_ACCURACY_LOG = 7;
 
     FSE_dtable dtable;
 
     // Construct the FSE table
-    const size_t read =
-            FSE_decode_header(&dtable, src, src_len, MAX_ACCURACY_LOG);
-
-    if (src_len < compressed_size) {
-        INP_SIZE();
-    }
+    FSE_decode_header(&dtable, in, MAX_ACCURACY_LOG);
 
     // Decode the weights
-    *num_symbs = FSE_decompress_interleaved2(
-        &dtable, weights, HUF_MAX_SYMBS, src + read, compressed_size - read);
+    *num_symbs = FSE_decompress_interleaved2(&dtable, weights, in);
 
     FSE_free_dtable(&dtable);
-
-    return compressed_size;
 }
 /******* END LITERALS DECODING ************************************************/
 
 /******* SEQUENCE DECODING ****************************************************/
 /// The combination of FSE states needed to decode sequences
 typedef struct {
-    u16 ll_state, of_state, ml_state;
-    FSE_dtable ll_table, of_table, ml_table;
+    FSE_dtable ll_table;
+    FSE_dtable of_table;
+    FSE_dtable ml_table;
+
+    u16 ll_state;
+    u16 of_state;
+    u16 ml_state;
 } sequence_state_t;
 
 /// Different modes to signal to decode_seq_tables what to do
@@ -1031,47 +935,36 @@ static const u8 SEQ_MATCH_LENGTH_EXTRA_BITS[53] = {
 /// Offset decoding is simpler so we just need a maximum code value
 static const u8 SEQ_MAX_CODES[3] = {35, -1, 52};
 
-static void decompress_sequences(frame_context_t *const ctx, const u8 *src,
-                                 size_t src_len,
+static void decompress_sequences(frame_context_t *const ctx,
+                                 istream_t *const in,
                                  sequence_command_t *const sequences,
                                  const size_t num_sequences);
 static sequence_command_t decode_sequence(sequence_state_t *const state,
                                           const u8 *const src,
                                           i64 *const offset);
-static size_t decode_seq_table(const u8 *src, size_t src_len,
-                               FSE_dtable *const table, const seq_part_t type,
-                               const seq_mode_t mode);
+static void decode_seq_table(istream_t *const in, FSE_dtable *const table,
+                               const seq_part_t type, const seq_mode_t mode);
 
-static size_t decode_sequences(frame_context_t *const ctx, const u8 *src,
-                               size_t src_len,
+static size_t decode_sequences(frame_context_t *const ctx, istream_t *in,
                                sequence_command_t **const sequences) {
     size_t num_sequences;
 
     // Decode the sequence header and allocate space for the output
-    if (src_len < 1) {
-        INP_SIZE();
-    }
-    if (src[0] == 0) {
+    u8 header = IO_read_bits(in, 8);
+    if (header == 0) {
+        // "There are no sequences. The sequence section stops there.
+        // Regenerated content is defined entirely by literals section."
         *sequences = NULL;
         return 0;
-    } else if (src[0] < 128) {
-        num_sequences = src[0];
-        src++;
-        src_len--;
-    } else if (src[0] < 255) {
-        if (src_len < 2) {
-            INP_SIZE();
-        }
-        num_sequences = ((src[0] - 128) << 8) + src[1];
-        src += 2;
-        src_len -= 2;
+    } else if (header < 128) {
+        // "Number_of_Sequences = byte0 . Uses 1 byte."
+        num_sequences = header;
+    } else if (header < 255) {
+        // "Number_of_Sequences = ((byte0-128) << 8) + byte1 . Uses 2 bytes."
+        num_sequences = ((header - 128) << 8) + IO_read_bits(in, 8);
     } else {
-        if (src_len < 3) {
-            INP_SIZE();
-        }
-        num_sequences = src[1] + ((u64)src[2] << 8) + 0x7F00;
-        src += 3;
-        src_len -= 3;
+        // "Number_of_Sequences = byte1 + (byte2<<8) + 0x7F00 . Uses 3 bytes."
+        num_sequences = IO_read_bits(in, 16) + 0x7F00;
     }
 
     *sequences = malloc(num_sequences * sizeof(sequence_command_t));
@@ -1079,51 +972,29 @@ static size_t decode_sequences(frame_context_t *const ctx, const u8 *src,
         BAD_ALLOC();
     }
 
-    decompress_sequences(ctx, src, src_len, *sequences, num_sequences);
+    decompress_sequences(ctx, in, *sequences, num_sequences);
     return num_sequences;
 }
 
 /// Decompress the FSE encoded sequence commands
-static void decompress_sequences(frame_context_t *const ctx, const u8 *src,
-                                 size_t src_len,
+static void decompress_sequences(frame_context_t *const ctx, istream_t *in,
                                  sequence_command_t *const sequences,
                                  const size_t num_sequences) {
-    if (src_len < 1) {
-        INP_SIZE();
-    }
-    u8 compression_modes = src[0];
-    src++;
-    src_len--;
+    u8 compression_modes = IO_read_bits(in, 8);
 
     if ((compression_modes & 3) != 0) {
         CORRUPTION();
     }
 
-    {
-        size_t read;
-        // Update the tables we have stored in the context
-        read = decode_seq_table(src, src_len, &ctx->ll_dtable,
-                                seq_literal_length,
-                                (compression_modes >> 6) & 3);
-        src += read;
-        src_len -= read;
-    }
+    // Update the tables we have stored in the context
+    decode_seq_table(in, &ctx->ll_dtable, seq_literal_length,
+                     (compression_modes >> 6) & 3);
 
-    {
-        const size_t read =
-                decode_seq_table(src, src_len, &ctx->of_dtable, seq_offset,
-                                 (compression_modes >> 4) & 3);
-        src += read;
-        src_len -= read;
-    }
+    decode_seq_table(in, &ctx->of_dtable, seq_offset,
+                     (compression_modes >> 4) & 3);
 
-    {
-        const size_t read = decode_seq_table(src, src_len, &ctx->ml_dtable,
-                                             seq_match_length,
-                                             (compression_modes >> 2) & 3);
-        src += read;
-        src_len -= read;
-    }
+    decode_seq_table(in, &ctx->ml_dtable, seq_match_length,
+                     (compression_modes >> 2) & 3);
 
     // Check to make sure none of the tables are uninitialized
     if (!ctx->ll_dtable.symbols || !ctx->of_dtable.symbols ||
@@ -1137,8 +1008,13 @@ static void decompress_sequences(frame_context_t *const ctx, const u8 *src,
     memcpy(&state.of_table, &ctx->of_dtable, sizeof(FSE_dtable));
     memcpy(&state.ml_table, &ctx->ml_dtable, sizeof(FSE_dtable));
 
-    const int padding = 8 - log2inf(src[src_len - 1]);
-    i64 offset = src_len * 8 - padding;
+    size_t len = IO_istream_len(in);
+    const u8 *const src = IO_read_bytes(in, len);
+
+    // "After writing the last bit containing information, the compressor writes
+    // a single 1-bit and then fills the byte with 0-7 0 bits of padding."
+    const int padding = 8 - log2inf(src[len - 1]);
+    i64 offset = len * 8 - padding;
 
     FSE_init_state(&state.ll_table, &state.ll_state, src, &offset);
     FSE_init_state(&state.of_table, &state.of_state, src, &offset);
@@ -1153,7 +1029,7 @@ static void decompress_sequences(frame_context_t *const ctx, const u8 *src,
         CORRUPTION();
     }
 
-    // Don't free our tables so they can be used in the next block
+    // Don't free tables so they can be used in the next block
 }
 
 // Decode a single sequence and update the state
@@ -1194,9 +1070,8 @@ static sequence_command_t decode_sequence(sequence_state_t *const state,
 }
 
 /// Given a sequence part and table mode, decode the FSE distribution
-static size_t decode_seq_table(const u8 *src, size_t src_len,
-                               FSE_dtable *const table, const seq_part_t type,
-                               const seq_mode_t mode) {
+static void decode_seq_table(istream_t *const in, FSE_dtable *const table,
+                               const seq_part_t type, const seq_mode_t mode) {
     // Constant arrays indexed by seq_part_t
     const i16 *const default_distributions[] = {SEQ_LITERAL_LENGTH_DEFAULT_DIST,
                                                 SEQ_OFFSET_DEFAULT_DIST,
@@ -1207,7 +1082,7 @@ static size_t decode_seq_table(const u8 *src, size_t src_len,
     const size_t max_accuracies[] = {9, 8, 9};
 
     if (mode != seq_repeat) {
-        // ree old one before overwriting
+        // Free old one before overwriting
         FSE_free_dtable(table);
     }
 
@@ -1218,102 +1093,102 @@ static size_t decode_seq_table(const u8 *src, size_t src_len,
         const size_t accuracy_log = default_distribution_accuracies[type];
 
         FSE_init_dtable(table, distribution, symbs, accuracy_log);
-
-        return 0;
+        break;
     }
     case seq_rle: {
-        if (src_len < 1) {
-            INP_SIZE();
-        }
-        const u8 symb = src[0];
-        src++;
-        src_len--;
+        const u8 symb = IO_read_bits(in, 8);
         FSE_init_dtable_rle(table, symb);
-
-        return 1;
+        break;
     }
     case seq_fse: {
-        size_t read =
-            FSE_decode_header(table, src, src_len, max_accuracies[type]);
-        src += read;
-        src_len -= read;
-
-        return read;
+        FSE_decode_header(table, in, max_accuracies[type]);
+        break;
     }
     case seq_repeat:
-        // Don't have to do anything here as we're not changing the table
-        return 0;
+        // Nothing to do here, table will be unchanged
+        break;
     default:
         // Impossible, as mode is from 0-3
-        return -1;
+        IMPOSSIBLE();
+        break;
     }
 }
 /******* END SEQUENCE DECODING ************************************************/
 
 /******* SEQUENCE EXECUTION ***************************************************/
-static void execute_sequences(io_streams_t *const streams,
-                              frame_context_t *const ctx,
+static void execute_sequences(frame_context_t *const ctx, ostream_t *const out,
+                              const u8 *const literals,
+                              const size_t literals_len,
                               const sequence_command_t *const sequences,
-                              const size_t num_sequences,
-                              const u8 *literals,
-                              size_t literals_len) {
+                              const size_t num_sequences) {
+    istream_t litstream = IO_make_istream(literals, literals_len);
+
     u64 *const offset_hist = ctx->previous_offsets;
     size_t total_output = ctx->current_total_output;
 
     for (size_t i = 0; i < num_sequences; i++) {
         const sequence_command_t seq = sequences[i];
 
-        if (seq.literal_length > literals_len) {
-            CORRUPTION();
+        {
+            if (seq.literal_length > IO_istream_len(&litstream)) {
+                CORRUPTION();
+            }
+
+            u8 *const write_ptr = IO_write_bytes(out, seq.literal_length);
+            const u8 *const read_ptr =
+                    IO_read_bytes(&litstream, seq.literal_length);
+            // Copy literals to output
+            memcpy(write_ptr, read_ptr, seq.literal_length);
+
+            total_output += seq.literal_length;
         }
 
-        if (streams->dst_len < seq.literal_length + seq.match_length) {
-            OUT_SIZE();
-        }
-        // Copy literals to output
-        memcpy(streams->dst, literals, seq.literal_length);
-
-        literals += seq.literal_length;
-        literals_len -= seq.literal_length;
-
-        streams->dst += seq.literal_length;
-        streams->dst_len -= seq.literal_length;
-
-        total_output += seq.literal_length;
-
         size_t offset;
 
         // Offsets are special, we need to handle the repeat offsets
         if (seq.offset <= 3) {
-            u32 idx = seq.offset;
+            // "The first 3 values define a repeated offset and we will call
+            // them Repeated_Offset1, Repeated_Offset2, and Repeated_Offset3.
+            // They are sorted in recency order, with Repeated_Offset1 meaning
+            // 'most recent one'".
+
+            // Use 0 indexing for the array
+            u32 idx = seq.offset - 1;
             if (seq.literal_length == 0) {
-                // Special case when literal length is 0
+                // "There is an exception though, when current sequence's
+                // literals length is 0. In this case, repeated offsets are
+                // shifted by one, so Repeated_Offset1 becomes Repeated_Offset2,
+                // Repeated_Offset2 becomes Repeated_Offset3, and
+                // Repeated_Offset3 becomes Repeated_Offset1 - 1_byte."
                 idx++;
             }
 
-            if (idx == 1) {
-                offset = offset_hist[1];
+            if (idx == 0) {
+                offset = offset_hist[0];
             } else {
-                // If idx == 4 then literal length was 0 and the offset was 3
-                offset = idx < 4 ? offset_hist[idx] : offset_hist[1] - 1;
+                // If idx == 3 then literal length was 0 and the offset was 3,
+                // as per the exception listed above
+                offset = idx < 3 ? offset_hist[idx] : offset_hist[0] - 1;
 
-                // If idx == 2 we don't need to modify offset_hist[3]
-                if (idx > 2) {
-                    offset_hist[3] = offset_hist[2];
+                // If idx == 1 we don't need to modify offset_hist[2]
+                if (idx > 1) {
+                    offset_hist[2] = offset_hist[1];
                 }
-                offset_hist[2] = offset_hist[1];
-                offset_hist[1] = offset;
+                offset_hist[1] = offset_hist[0];
+                offset_hist[0] = offset;
             }
         } else {
             offset = seq.offset - 3;
 
             // Shift back history
-            offset_hist[3] = offset_hist[2];
             offset_hist[2] = offset_hist[1];
-            offset_hist[1] = offset;
+            offset_hist[1] = offset_hist[0];
+            offset_hist[0] = offset;
         }
 
         size_t match_length = seq.match_length;
+
+        u8 *write_ptr = IO_write_bytes(out, match_length);
         if (total_output <= ctx->header.window_size) {
             // In this case offset might go back into the dictionary
             if (offset > total_output + ctx->dict_content_len) {
@@ -1322,13 +1197,16 @@ static void execute_sequences(io_streams_t *const streams,
             }
 
             if (offset > total_output) {
+                // "The rest of the dictionary is its content. The content act
+                // as a "past" in front of data to compress or decompress, so it
+                // can be referenced in sequence commands."
                 const size_t dict_copy =
                     MIN(offset - total_output, match_length);
                 const size_t dict_offset =
                     ctx->dict_content_len - (offset - total_output);
-                for (size_t i = 0; i < dict_copy; i++) {
-                    *streams->dst++ = ctx->dict_content[dict_offset + i];
-                }
+
+                memcpy(write_ptr, ctx->dict_content + dict_offset, dict_copy);
+                write_ptr += dict_copy;
                 match_length -= dict_copy;
             }
         } else if (offset > ctx->header.window_size) {
@@ -1340,31 +1218,29 @@ static void execute_sequences(io_streams_t *const streams,
         // ex: if the output so far was "abc", a command with offset=3 and
         // match_length=6 would produce "abcabcabc" as the new output
         for (size_t i = 0; i < match_length; i++) {
-            *streams->dst = *(streams->dst - offset);
-            streams->dst++;
+            *write_ptr = *(write_ptr - offset);
+            write_ptr++;
         }
 
-        streams->dst_len -= seq.match_length;
         total_output += seq.match_length;
     }
 
-    if (streams->dst_len < literals_len) {
-        OUT_SIZE();
-    }
-    // Copy any leftover literals
-    memcpy(streams->dst, literals, literals_len);
-    streams->dst += literals_len;
-    streams->dst_len -= literals_len;
+    {
+        size_t len = IO_istream_len(&litstream);
+        u8 *const write_ptr = IO_write_bytes(out, len);
+        const u8 *const read_ptr = IO_read_bytes(&litstream, len);
+        // Copy any leftover literals
+        memcpy(write_ptr, read_ptr, len);
 
-    total_output += literals_len;
+        total_output += len;
+    }
 
     ctx->current_total_output = total_output;
 }
 /******* END SEQUENCE EXECUTION ***********************************************/
 
 /******* OUTPUT SIZE COUNTING *************************************************/
-size_t traverse_frame(const frame_header_t *const header, const u8 *src,
-                      size_t src_len);
+static void traverse_frame(const frame_header_t *const header, istream_t *const in);
 
 /// Get the decompressed size of an input stream so memory can be allocated in
 /// advance.
@@ -1372,115 +1248,75 @@ size_t traverse_frame(const frame_header_t *const header, const u8 *src,
 /// implementation, as this API allows for the decompression of multiple
 /// concatenated frames.
 size_t ZSTD_get_decompressed_size(const void *src, const size_t src_len) {
-  const u8 *ip = (const u8 *) src;
-  size_t ip_len = src_len;
-  size_t dst_size = 0;
+    istream_t in = IO_make_istream(src, src_len);
+    size_t dst_size = 0;
 
-  // Each frame header only gives us the size of its frame, so iterate over all
-  // frames
-  while (ip_len > 0) {
-    if (ip_len < 4) {
-      INP_SIZE();
+    // Each frame header only gives the size of its own frame, so iterate over
+    // every frame in the input and sum the sizes to get the total
+    // decompressed size
+    while (IO_istream_len(&in) > 0) {
+        const u32 magic_number = IO_read_bits(&in, 32);
+
+        if ((magic_number & ~0xFU) == 0x184D2A50U) {
+            // skippable frame, this has no impact on output size
+            const size_t frame_size = IO_read_bits(&in, 32);
+            IO_advance_input(&in, frame_size);
+        } else if (magic_number == 0xFD2FB528U) {
+            // ZSTD frame
+            frame_header_t header;
+            parse_frame_header(&header, &in);
+
+            if (header.frame_content_size == 0 && !header.single_segment_flag) {
+                // Content size not provided, we can't tell
+                return -1;
+            }
+
+            dst_size += header.frame_content_size;
+
+            // Consume the input from the frame to reach the start of the next
+            traverse_frame(&header, &in);
+        } else {
+            // not a real frame
+            ERROR("Invalid magic number");
+        }
     }
 
-    const u32 magic_number = read_bits_LE(ip, 32, 0);
-
-    ip += 4;
-    ip_len -= 4;
-    if (magic_number >= 0x184D2A50U && magic_number <= 0x184D2A5F) {
-        // skippable frame, this has no impact on output size
-        if (ip_len < 4) {
-            INP_SIZE();
-        }
-        const size_t frame_size = read_bits_LE(ip, 32, 32);
-
-        if (ip_len < 4 + frame_size) {
-            INP_SIZE();
-        }
-
-        // skip over frame
-        ip += 4 + frame_size;
-        ip_len -= 4 + frame_size;
-    } else if (magic_number == 0xFD2FB528U) {
-        // ZSTD frame
-        frame_header_t header;
-        parse_frame_header(&header, ip, ip_len);
-
-        if (header.frame_content_size == 0 && !header.single_segment_flag) {
-            // Content size not provided, we can't tell
-            return -1;
-        }
-
-        dst_size += header.frame_content_size;
-
-        // we need to traverse the frame to find when the next one starts
-        const size_t traversed = traverse_frame(&header, ip, ip_len);
-        ip += traversed;
-        ip_len -= traversed;
-    } else {
-        // not a real frame
-        ERROR("Invalid magic number");
-    }
-  }
-
-  return dst_size;
+    return dst_size;
 }
 
 /// Iterate over each block in a frame to find the end of it, to get to the
 /// start of the next frame
-size_t traverse_frame(const frame_header_t *const header, const u8 *src,
-                      size_t src_len) {
-    const u8 *const src_beg = src;
-    const u8 *const src_end = src + src_len;
-    src += header->header_size;
-    src_len += header->header_size;
-
+static void traverse_frame(const frame_header_t *const header, istream_t *const in) {
     int last_block = 0;
 
     do {
-        if (src + 3 > src_end) {
-            INP_SIZE();
-        }
         // Parse the block header
-        last_block = src[0] & 1;
-        const int block_type = (src[0] >> 1) & 3;
-        const size_t block_len = read_bits_LE(src, 21, 3);
+        last_block = IO_read_bits(in, 1);
+        const int block_type = IO_read_bits(in, 2);
+        const size_t block_len = IO_read_bits(in, 21);
 
-        src += 3;
         switch (block_type) {
         case 0: // Raw block, block_len bytes
-            if (src + block_len > src_end) {
-                INP_SIZE();
-            }
-            src += block_len;
+            IO_advance_input(in, block_len);
             break;
         case 1: // RLE block, 1 byte
-            if (src + 1 > src_end) {
-                INP_SIZE();
-            }
-            src++;
+            IO_advance_input(in, 1);
             break;
         case 2: // Compressed block, compressed size is block_len
-            if (src + block_len > src_end) {
-                INP_SIZE();
-            }
-            src += block_len;
+            IO_advance_input(in, block_len);
             break;
         case 3:
             // Reserved block type
             CORRUPTION();
             break;
+        default:
+            IMPOSSIBLE();
         }
     } while (!last_block);
 
     if (header->content_checksum_flag) {
-        if (src + 4 > src_end) {
-            INP_SIZE();
-        }
-        src += 4;
+        IO_advance_input(in, 4);
     }
-
-    return src - src_beg;
 }
 
 /******* END OUTPUT SIZE COUNTING *********************************************/
@@ -1495,68 +1331,46 @@ static void parse_dictionary(dictionary_t *const dict, const u8 *src,
     if (src_len < 8) {
         INP_SIZE();
     }
-    const u32 magic_number = read_bits_LE(src, 32, 0);
+
+    istream_t in = IO_make_istream(src, src_len);
+
+    const u32 magic_number = IO_read_bits(&in, 32);
     if (magic_number != 0xEC30A437) {
         // raw content dict
         init_raw_content_dict(dict, src, src_len);
         return;
     }
-    dict->dictionary_id = read_bits_LE(src, 32, 32);
 
-    src += 8;
-    src_len -= 8;
+    dict->dictionary_id = IO_read_bits(&in, 32);
 
     // Parse the provided entropy tables in order
-    {
-        const size_t read =
-                decode_huf_table(src, src_len, &dict->literals_dtable);
-        src += read;
-        src_len -= read;
-    }
-    {
-        const size_t read = decode_seq_table(src, src_len, &dict->of_dtable,
-                                             seq_offset, seq_fse);
-        src += read;
-        src_len -= read;
-    }
-    {
-        const size_t read = decode_seq_table(src, src_len, &dict->ml_dtable,
-                                             seq_match_length, seq_fse);
-        src += read;
-        src_len -= read;
-    }
-    {
-        const size_t read = decode_seq_table(src, src_len, &dict->ll_dtable,
-                                             seq_literal_length, seq_fse);
-        src += read;
-        src_len -= read;
-    }
+    decode_huf_table(&in, &dict->literals_dtable);
+    decode_seq_table(&in, &dict->of_dtable, seq_offset, seq_fse);
+    decode_seq_table(&in, &dict->ml_dtable, seq_match_length, seq_fse);
+    decode_seq_table(&in, &dict->ll_dtable, seq_literal_length, seq_fse);
 
-    if (src_len < 12) {
-        INP_SIZE();
-    }
     // Read in the previous offset history
-    dict->previous_offsets[1] = read_bits_LE(src, 32, 0);
-    dict->previous_offsets[2] = read_bits_LE(src, 32, 32);
-    dict->previous_offsets[3] = read_bits_LE(src, 32, 64);
-
-    src += 12;
-    src_len -= 12;
+    dict->previous_offsets[0] = IO_read_bits(&in, 32);
+    dict->previous_offsets[1] = IO_read_bits(&in, 32);
+    dict->previous_offsets[2] = IO_read_bits(&in, 32);
 
     // Ensure the provided offsets aren't too large
-    for (int i = 1; i <= 3; i++) {
+    for (int i = 0; i < 3; i++) {
         if (dict->previous_offsets[i] > src_len) {
             ERROR("Dictionary corrupted");
         }
     }
+
     // The rest is the content
-    dict->content = malloc(src_len);
+    dict->content_size = IO_istream_len(&in);
+    dict->content = malloc(dict->content_size);
     if (!dict->content) {
         BAD_ALLOC();
     }
 
-    dict->content_size = src_len;
-    memcpy(dict->content, src, src_len);
+    const u8 *const content = IO_read_bytes(&in, dict->content_size);
+
+    memcpy(dict->content, content, dict->content_size);
 }
 
 /// If parse_dictionary is given a raw content dictionary, it delegates here
@@ -1586,6 +1400,143 @@ static void free_dictionary(dictionary_t *const dict) {
 }
 /******* END DICTIONARY PARSING ***********************************************/
 
+/******* IO STREAM OPERATIONS *************************************************/
+#define UNALIGNED() ERROR("Attempting to operate on a non-byte aligned stream")
+/// Reads `num` bits (0 to 64) from a bitstream, and updates the internal offset
+static inline u64 IO_read_bits(istream_t *const in, const int num) {
+    if (num > 64 || num < 0) {
+        ERROR("Attempting to read an invalid number of bits");
+    }
+
+    const size_t bytes = (num + in->bit_offset + 7) / 8;  // bytes touched
+    const size_t full_bytes = (num + in->bit_offset) / 8; // bytes fully consumed
+    if (bytes > in->len) {
+        INP_SIZE();
+    }
+
+    const u64 result = read_bits_LE(in->ptr, num, in->bit_offset);
+
+    in->bit_offset = (num + in->bit_offset) % 8;
+    in->ptr += full_bytes;
+    in->len -= full_bytes;
+
+    return result;
+}
+
+/// Moves the stream back by `num` bits so that they are read again; no check
+/// is made that the rewind stays inside the original buffer
+static inline void IO_rewind_bits(istream_t *const in, int num) {
+    if (num < 0) {
+        ERROR("Attempting to rewind stream by a negative number of bits");
+    }
+
+    const int new_offset = in->bit_offset - num;
+    const i64 bytes = (new_offset - 7) / 8; // <= 0 when bit_offset is in 0..7
+
+    in->ptr += bytes; // bytes is non-positive here, so ptr steps backwards
+    in->len -= bytes; // ...and the remaining length grows by the same amount
+    in->bit_offset = ((new_offset % 8) + 8) % 8; // wrap into 0..7
+}
+
+/// If the remaining bits in a byte will be unused, advance to the end of the
+/// byte
+static inline void IO_align_stream(istream_t *const in) {
+    if (in->bit_offset != 0) {
+        if (in->len == 0) {
+            INP_SIZE();
+        }
+        in->ptr++;
+        in->len--;
+        in->bit_offset = 0;
+    }
+}
+
+/// Write the given byte into the output stream
+static inline void IO_write_byte(ostream_t *const out, u8 symb) {
+    if (out->len == 0) {
+        OUT_SIZE();
+    }
+
+    out->ptr[0] = symb;
+    out->ptr++;
+    out->len--;
+}
+
+/// Returns the number of bytes left to be read in this stream.  The stream must
+/// be byte aligned.
+static inline size_t IO_istream_len(const istream_t *const in) {
+    return in->len;
+}
+
+/// Returns a pointer where `len` bytes can be read, and advances the internal
+/// state.  The stream must be byte aligned.
+static inline const u8 *IO_read_bytes(istream_t *const in, size_t len) {
+    if (len > in->len) {
+        INP_SIZE();
+    }
+    if (in->bit_offset != 0) {
+        UNALIGNED();
+    }
+    const u8 *const ptr = in->ptr;
+    in->ptr += len;
+    in->len -= len;
+
+    return ptr;
+}
+/// Returns a pointer to write `len` bytes to, and advances the internal state
+static inline u8 *IO_write_bytes(ostream_t *const out, size_t len) {
+    if (len > out->len) {
+        OUT_SIZE(); // the output buffer, not the input, is what ran out
+    }
+    u8 *const ptr = out->ptr;
+    out->ptr += len;
+    out->len -= len;
+
+    return ptr;
+}
+
+/// Advance the inner state by `len` bytes
+static inline void IO_advance_input(istream_t *const in, size_t len) {
+    if (len > in->len) {
+         INP_SIZE();
+    }
+    if (in->bit_offset != 0) {
+        UNALIGNED();
+    }
+
+    in->ptr += len;
+    in->len -= len;
+}
+
+/// Returns an `ostream_t` constructed from the given pointer and length
+static inline ostream_t IO_make_ostream(u8 *out, size_t len) {
+    return (ostream_t) { out, len };
+}
+
+/// Returns an `istream_t` constructed from the given pointer and length
+static inline istream_t IO_make_istream(const u8 *in, size_t len) {
+    return (istream_t) { in, 0, len };
+}
+
+/// Returns an `istream_t` with the same base as `in`, and length `len`
+/// Then, advance `in` to account for the consumed bytes
+/// `in` must be byte aligned
+static inline istream_t IO_make_sub_istream(istream_t *const in, size_t len) {
+    if (len > in->len) {
+        INP_SIZE();
+    }
+    if (in->bit_offset != 0) {
+        UNALIGNED();
+    }
+    const istream_t sub = { in->ptr, in->bit_offset, len };
+
+    in->ptr += len;
+    in->len -= len;
+
+    return sub;
+}
+/******* END IO STREAM OPERATIONS *********************************************/
+
 /******* BITSTREAM OPERATIONS *************************************************/
 /// Read `num` bits (up to 64) from `src + offset`, where `offset` is in bits
 static inline u64 read_bits_LE(const u8 *src, const int num,
@@ -1676,28 +1627,29 @@ static inline void HUF_init_state(const HUF_dtable *const dtable,
     *state = STREAM_read_bits(src, bits, offset);
 }
 
-static size_t HUF_decompress_1stream(const HUF_dtable *const dtable, u8 *dst,
-                                     const size_t dst_len, const u8 *src,
-                                     size_t src_len) {
-    const u8 *const dst_max = dst + dst_len;
-    const u8 *const odst = dst;
+static size_t HUF_decompress_1stream(const HUF_dtable *const dtable,
+                                     ostream_t *const out,
+                                     istream_t *const in) {
+    const size_t len = IO_istream_len(in);
+    if (len == 0) {
+        INP_SIZE();
+    }
+    const u8 *const src = IO_read_bytes(in, len);
 
     // To maintain similarity with FSE, start from the end
     // Find the last 1 bit
-    const int padding = 8 - log2inf(src[src_len - 1]);
+    const int padding = 8 - log2inf(src[len - 1]);
 
-    i64 offset = src_len * 8 - padding;
+    i64 offset = len * 8 - padding;
     u16 state;
 
     HUF_init_state(dtable, &state, src, &offset);
 
-    while (dst < dst_max && offset > -dtable->max_bits) {
+    size_t symbols_written = 0;
+    while (offset > -dtable->max_bits) {
         // Iterate over the stream, decoding one symbol at a time
-        *dst++ = HUF_decode_symbol(dtable, &state, src, &offset);
-    }
-    // If we stopped before consuming all the input, we didn't have enough space
-    if (dst == dst_max && offset > -dtable->max_bits) {
-        OUT_SIZE();
+        IO_write_byte(out, HUF_decode_symbol(dtable, &state, src, &offset));
+        symbols_written++;
     }
 
     // When all symbols have been decoded, the final state value shouldn't have
@@ -1709,50 +1661,30 @@ static size_t HUF_decompress_1stream(const HUF_dtable *const dtable, u8 *dst,
         CORRUPTION();
     }
 
-    return dst - odst;
+    return symbols_written;
 }
 
-static size_t HUF_decompress_4stream(const HUF_dtable *const dtable, u8 *dst,
-                                     const size_t dst_len, const u8 *const src,
-                                     const size_t src_len) {
-    if (src_len < 6) {
-        INP_SIZE();
-    }
+static size_t HUF_decompress_4stream(const HUF_dtable *const dtable,
+                                     ostream_t *const out, istream_t *const in) {
+    const size_t csize1 = IO_read_bits(in, 16);
+    const size_t csize2 = IO_read_bits(in, 16);
+    const size_t csize3 = IO_read_bits(in, 16);
 
-    const u8 *const src1 = src + 6;
-    const u8 *const src2 = src1 + read_bits_LE(src, 16, 0);
-    const u8 *const src3 = src2 + read_bits_LE(src, 16, 16);
-    const u8 *const src4 = src3 + read_bits_LE(src, 16, 32);
-    const u8 *const src_end = src + src_len;
-
-    // We can't test with all 4 sizes because the 4th size is a function of the
-    // other 3 and the provided length
-    if (src4 - src >= src_len) {
-        INP_SIZE();
-    }
-
-    const size_t segment_size = (dst_len + 3) / 4;
-    u8 *const dst1 = dst;
-    u8 *const dst2 = dst1 + segment_size;
-    u8 *const dst3 = dst2 + segment_size;
-    u8 *const dst4 = dst3 + segment_size;
-    u8 *const dst_end = dst + dst_len;
-
-    size_t total_out = 0;
+    istream_t in1 = IO_make_sub_istream(in, csize1);
+    istream_t in2 = IO_make_sub_istream(in, csize2);
+    istream_t in3 = IO_make_sub_istream(in, csize3);
+    istream_t in4 = IO_make_sub_istream(in, IO_istream_len(in));
 
+    size_t total_output = 0;
     // Decode each stream independently for simplicity
     // If we wanted to we could decode all 4 at the same time for speed,
     // utilizing more execution units
-    total_out += HUF_decompress_1stream(dtable, dst1, segment_size, src1,
-                                        src2 - src1);
-    total_out += HUF_decompress_1stream(dtable, dst2, segment_size, src2,
-                                        src3 - src2);
-    total_out += HUF_decompress_1stream(dtable, dst3, segment_size, src3,
-                                        src4 - src3);
-    total_out += HUF_decompress_1stream(dtable, dst4, dst_end - dst4, src4,
-                                        src_end - src4);
+    total_output += HUF_decompress_1stream(dtable, out, &in1);
+    total_output += HUF_decompress_1stream(dtable, out, &in2);
+    total_output += HUF_decompress_1stream(dtable, out, &in3);
+    total_output += HUF_decompress_1stream(dtable, out, &in4);
 
-    return total_out;
+    return total_output;
 }
 
 static void HUF_init_dtable(HUF_dtable *const table, const u8 *const bits,
@@ -1827,6 +1759,10 @@ static void HUF_init_dtable_usingweights(HUF_dtable *const table,
 
     u64 weight_sum = 0;
     for (int i = 0; i < num_symbs; i++) {
+        // Weights are in the same range as bit count
+        if (weights[i] > HUF_MAX_BITS) {
+            CORRUPTION();
+        }
         weight_sum += weights[i] > 0 ? (u64)1 << (weights[i] - 1) : 0;
     }
 
@@ -1913,20 +1849,17 @@ static inline void FSE_init_state(const FSE_dtable *const dtable,
 }
 
 static size_t FSE_decompress_interleaved2(const FSE_dtable *const dtable,
-                                          u8 *dst, const size_t dst_len,
-                                          const u8 *const src,
-                                          const size_t src_len) {
-    if (src_len == 0) {
+                                          ostream_t *const out,
+                                          istream_t *const in) {
+    const size_t len = IO_istream_len(in);
+    if (len == 0) {
         INP_SIZE();
     }
-
-    const u8 *const dst_max = dst + dst_len;
-    const u8 *const odst = dst;
+    const u8 *const src = IO_read_bytes(in, len);
 
     // Find the last 1 bit
-    const int padding = 8 - log2inf(src[src_len - 1]);
-
-    i64 offset = src_len * 8 - padding;
+    const int padding = 8 - log2inf(src[len - 1]);
+    i64 offset = len * 8 - padding;
 
     // The end of the stream contains the 2 states, in this order
     u16 state1, state2;
@@ -1936,30 +1869,28 @@ static size_t FSE_decompress_interleaved2(const FSE_dtable *const dtable,
     // Decode until we overflow the stream
     // Since we decode in reverse order, overflowing the stream is offset going
     // negative
+    size_t symbols_written = 0;
     while (1) {
-        if (dst > dst_max - 2) {
-            OUT_SIZE();
-        }
-        *dst++ = FSE_decode_symbol(dtable, &state1, src, &offset);
+        IO_write_byte(out, FSE_decode_symbol(dtable, &state1, src, &offset));
+        symbols_written++;
         if (offset < 0) {
             // There's still a symbol to decode in state2
-            *dst++ = FSE_peek_symbol(dtable, state2);
+            IO_write_byte(out, FSE_peek_symbol(dtable, state2));
+            symbols_written++;
             break;
         }
 
-        if (dst > dst_max - 2) {
-            OUT_SIZE();
-        }
-        *dst++ = FSE_decode_symbol(dtable, &state2, src, &offset);
+        IO_write_byte(out, FSE_decode_symbol(dtable, &state2, src, &offset));
+        symbols_written++;
         if (offset < 0) {
             // There's still a symbol to decode in state1
-            *dst++ = FSE_peek_symbol(dtable, state1);
+            IO_write_byte(out, FSE_peek_symbol(dtable, state1));
+            symbols_written++;
             break;
         }
     }
 
-    // Number of symbols read
-    return dst - odst;
+    return symbols_written;
 }
 
 static void FSE_init_dtable(FSE_dtable *const dtable,
@@ -2042,17 +1973,13 @@ static void FSE_init_dtable(FSE_dtable *const dtable,
 
 /// Decode an FSE header as defined in the Zstandard format specification and
 /// use the decoded frequencies to initialize a decoding table.
-static size_t FSE_decode_header(FSE_dtable *const dtable, const u8 *const src,
-                                const size_t src_len,
+static void FSE_decode_header(FSE_dtable *const dtable, istream_t *const in,
                                 const int max_accuracy_log) {
     if (max_accuracy_log > FSE_MAX_ACCURACY_LOG) {
         ERROR("FSE accuracy too large");
     }
-    if (src_len < 1) {
-        INP_SIZE();
-    }
 
-    const int accuracy_log = 5 + read_bits_LE(src, 4, 0);
+    const int accuracy_log = 5 + IO_read_bits(in, 4);
     if (accuracy_log > max_accuracy_log) {
         ERROR("FSE accuracy too large");
     }
@@ -2062,14 +1989,11 @@ static size_t FSE_decode_header(FSE_dtable *const dtable, const u8 *const src,
     i16 frequencies[FSE_MAX_SYMBS];
 
     int symb = 0;
-    // Offset of 4 because 4 bits were already read in for accuracy
-    size_t offset = 4;
     while (remaining > 1 && symb < FSE_MAX_SYMBS) {
         // Log of the number of possible values we could read
         int bits = log2inf(remaining) + 1;
 
-        u16 val = read_bits_LE(src, bits, offset);
-        offset += bits;
+        u16 val = IO_read_bits(in, bits);
 
         // Try to mask out the lower bits to see if it qualifies for the "small
         // value" threshold
@@ -2077,7 +2001,7 @@ static size_t FSE_decode_header(FSE_dtable *const dtable, const u8 *const src,
         const u16 threshold = ((u16)1 << bits) - 1 - remaining;
 
         if ((val & lower_mask) < threshold) {
-            offset--;
+            IO_rewind_bits(in, 1);
             val = val & lower_mask;
         } else if (val > lower_mask) {
             val = val - threshold;
@@ -2093,22 +2017,21 @@ static size_t FSE_decode_header(FSE_dtable *const dtable, const u8 *const src,
         // Handle the special probability = 0 case
         if (proba == 0) {
             // Read the next two bits to see how many more 0s
-            int repeat = read_bits_LE(src, 2, offset);
-            offset += 2;
+            int repeat = IO_read_bits(in, 2);
 
             while (1) {
                 for (int i = 0; i < repeat && symb < FSE_MAX_SYMBS; i++) {
                     frequencies[symb++] = 0;
                 }
                 if (repeat == 3) {
-                    repeat = read_bits_LE(src, 2, offset);
-                    offset += 2;
+                    repeat = IO_read_bits(in, 2);
                 } else {
                     break;
                 }
             }
         }
     }
+    IO_align_stream(in);
 
     if (remaining != 1 || symb >= FSE_MAX_SYMBS) {
         CORRUPTION();
@@ -2116,8 +2039,6 @@ static size_t FSE_decode_header(FSE_dtable *const dtable, const u8 *const src,
 
     // Initialize the decoding table using the determined weights
     FSE_init_dtable(dtable, frequencies, symb, accuracy_log);
-
-    return (offset + 7) / 8;
 }
 
 static void FSE_init_dtable_rle(FSE_dtable *const dtable, const u8 symb) {

From f191be2fe6cc68254c5488d958df96b3d5eb85a8 Mon Sep 17 00:00:00 2001
From: Sean Purcell <me@seanp.xyz>
Date: Fri, 3 Feb 2017 18:04:00 -0800
Subject: [PATCH 08/15] Inlined portions of specification for clarity

---
 contrib/educational_decoder/zstd_decompress.c | 370 +++++++++++++++---
 1 file changed, 309 insertions(+), 61 deletions(-)

diff --git a/contrib/educational_decoder/zstd_decompress.c b/contrib/educational_decoder/zstd_decompress.c
index 8f28313e..b46bb487 100644
--- a/contrib/educational_decoder/zstd_decompress.c
+++ b/contrib/educational_decoder/zstd_decompress.c
@@ -386,6 +386,11 @@ size_t ZSTD_decompress_with_dict(void *const dst, const size_t dst_len,
 
     istream_t in = {(const u8 *)src, 0, src_len};
     ostream_t out = {(u8 *)dst, dst_len};
+
+    // "A content compressed by Zstandard is transformed into a Zstandard frame.
+    // Multiple frames can be appended into a single file or stream. A frame is
+    // totally independent, has a defined beginning and end, and a set of
+    // parameters which tells the decoder how to decompress it."
     while (IO_istream_len(&in) > 0) {
         decode_frame(&out, &in, &parsed_dict);
     }
@@ -415,19 +420,43 @@ static void decode_frame(ostream_t *const out, istream_t *const in,
                          const dictionary_t *const dict) {
     const u32 magic_number = IO_read_bits(in, 32);
 
+    // Skippable frame
+    //
+    // "Magic_Number
+    //
+    // 4 Bytes, little-endian format. Value : 0x184D2A5?, which means any value
+    // from 0x184D2A50 to 0x184D2A5F. All 16 values are valid to identify a
+    // skippable frame."
     if ((magic_number & ~0xFU) == 0x184D2A50U) {
-        // Skippable frame
+        // "Skippable frames allow the insertion of user-defined data into a
+        // flow of concatenated frames. Its design is pretty straightforward,
+        // with the sole objective to allow the decoder to quickly skip over
+        // user-defined data and continue decoding.
+        //
+        // Skippable frames defined in this specification are compatible with
+        // LZ4 ones."
         const size_t frame_size = IO_read_bits(in, 32);
 
         // skip over frame
         IO_advance_input(in, frame_size);
-    } else if (magic_number == 0xFD2FB528U) {
+
+        return;
+    }
+
+    // Zstandard frame
+    //
+    // "Magic_Number
+    //
+    // 4 Bytes, little-endian format. Value : 0xFD2FB528"
+    if (magic_number == 0xFD2FB528U) {
         // ZSTD frame
         decode_data_frame(out, in, dict);
-    } else {
-        // not a real frame
-        ERROR("Invalid magic number");
+
+        return;
     }
+
+    // not a real frame
+    ERROR("Invalid magic number");
 }
 
 /// Decode a frame that contains compressed data.  Not all frames do as there
@@ -483,6 +512,17 @@ static void free_frame_context(frame_context_t *const context) {
 
 static void parse_frame_header(frame_header_t *const header,
                                istream_t *const in) {
+    // "The first header's byte is called the Frame_Header_Descriptor. It tells
+    // which other fields are present. Decoding this byte is enough to tell the
+    // size of Frame_Header.
+    //
+    // Bit number   Field name
+    // 7-6  Frame_Content_Size_flag
+    // 5    Single_Segment_flag
+    // 4    Unused_bit
+    // 3    Reserved_bit
+    // 2    Content_Checksum_flag
+    // 1-0  Dictionary_ID_flag"
     const u8 descriptor = IO_read_bits(in, 8);
 
     // decode frame header descriptor into flags
@@ -501,12 +541,18 @@ static void parse_frame_header(frame_header_t *const header,
 
     // decode window size
     if (!single_segment_flag) {
-        // Use the algorithm from the specification to compute window size
-        // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor
+        // "Provides guarantees on maximum back-reference distance that will be
+        // used within compressed data. This information is important for
+        // decoders to allocate enough memory.
+        //
+        // Bit numbers  7-3         2-0
+        // Field name   Exponent    Mantissa"
         u8 window_descriptor = IO_read_bits(in, 8);
         u8 exponent = window_descriptor >> 3;
         u8 mantissa = window_descriptor & 7;
 
+        // Use the algorithm from the specification to compute window size
+        // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor
         size_t window_base = (size_t)1 << (10 + exponent);
         size_t window_add = (window_base / 8) * mantissa;
         header->window_size = window_base + window_add;
@@ -514,6 +560,10 @@ static void parse_frame_header(frame_header_t *const header,
 
     // decode dictionary id if it exists
     if (dictionary_id_flag) {
+        // "This is a variable size field, which contains the ID of the
+        // dictionary required to properly decode the frame. Note that this
+        // field is optional. When it's not present, it's up to the caller to
+        // make sure it uses the correct dictionary. Format is little-endian."
         const int bytes_array[] = {0, 1, 2, 4};
         const int bytes = bytes_array[dictionary_id_flag];
 
@@ -524,6 +574,11 @@ static void parse_frame_header(frame_header_t *const header,
 
     // decode frame content size if it exists
     if (single_segment_flag || frame_content_size_flag) {
+        // "This is the original (uncompressed) size. This information is
+        // optional. The Field_Size is provided according to value of
+        // Frame_Content_Size_flag. The Field_Size can be equal to 0 (not
+        // present), 1, 2, 4 or 8 bytes. Format is little-endian."
+        //
         // if frame_content_size_flag == 0 but single_segment_flag is set, we
         // still have a 1 byte field
         const int bytes_array[] = {1, 2, 4, 8};
@@ -531,6 +586,7 @@ static void parse_frame_header(frame_header_t *const header,
 
         header->frame_content_size = IO_read_bits(in, bytes * 8);
         if (bytes == 2) {
+            // "When Field_Size is 2, the offset of 256 is added."
             header->frame_content_size += 256;
         }
     } else {
@@ -538,9 +594,10 @@ static void parse_frame_header(frame_header_t *const header,
     }
 
     if (single_segment_flag) {
-        // in this case the effective window size is frame_content_size this
-        // impacts sequence decoding as we need to determine whether to fall
-        // back to the dictionary or not on large offsets
+        // "The Window_Descriptor byte is optional. It is absent when
+        // Single_Segment_flag is set. In this case, the maximum back-reference
+        // distance is the content size itself, which can be any value from 1 to
+        // 2^64-1 bytes (16 EB)."
         header->window_size = header->frame_content_size;
     }
 }
@@ -584,16 +641,31 @@ static void frame_context_apply_dict(frame_context_t *const ctx,
 /// Decompress the data from a frame block by block
 static void decompress_data(frame_context_t *const ctx, ostream_t *const out,
                             istream_t *const in) {
+    // "A frame encapsulates one or multiple blocks. Each block can be
+    // compressed or not, and has a guaranteed maximum content size, which
+    // depends on frame parameters. Unlike frames, each block depends on
+    // previous blocks for proper decoding. However, each block can be
+    // decompressed without waiting for its successor, allowing streaming
+    // operations."
     int last_block = 0;
     do {
-        // Parse the block header
+        // "Last_Block
+        //
+        // The lowest bit signals if this block is the last one. Frame ends
+        // right after this block.
+        //
+        // Block_Type and Block_Size
+        //
+        // The next 2 bits represent the Block_Type, while the remaining 21 bits
+        // represent the Block_Size. Format is little-endian."
         last_block = IO_read_bits(in, 1);
         const int block_type = IO_read_bits(in, 2);
         const size_t block_len = IO_read_bits(in, 21);
 
         switch (block_type) {
         case 0: {
-            // Raw, uncompressed block
+            // "Raw_Block - this is an uncompressed block. Block_Size is the
+            // number of bytes to read and copy."
             const u8 *const read_ptr = IO_read_bytes(in, block_len);
             u8 *const write_ptr = IO_write_bytes(out, block_len);
             //
@@ -604,7 +676,9 @@ static void decompress_data(frame_context_t *const ctx, ostream_t *const out,
             break;
         }
         case 1: {
-            // RLE block, repeat the first byte N times
+            // "RLE_Block - this is a single byte, repeated N times. In which
+            // case, Block_Size is the size to regenerate, while the
+            // "compressed" block is just 1 byte (the byte to repeat)."
             const u8 *const read_ptr = IO_read_bytes(in, 1);
             u8 *const write_ptr = IO_write_bytes(out, block_len);
 
@@ -615,14 +689,18 @@ static void decompress_data(frame_context_t *const ctx, ostream_t *const out,
             break;
         }
         case 2: {
-            // Compressed block
+            // "Compressed_Block - this is a Zstandard compressed block,
+            // detailed in another section of this specification. Block_Size is
+            // the compressed size."
+
             // Create a sub-stream for the block
             istream_t block_stream = IO_make_sub_istream(in, block_len);
             decompress_block(ctx, out, &block_stream);
             break;
         }
         case 3:
-            // Reserved block type
+            // "Reserved - this is not a block. This value cannot be used with
+            // current version of this specification."
             CORRUPTION();
             break;
         default:
@@ -641,6 +719,12 @@ static void decompress_data(frame_context_t *const ctx, ostream_t *const out,
 /******* BLOCK DECOMPRESSION **************************************************/
 static void decompress_block(frame_context_t *const ctx, ostream_t *const out,
                              istream_t *const in) {
+    // "A compressed block consists of 2 sections :
+    //
+    // Literals_Section
+    // Sequences_Section"
+
+
     // Part 1: decode the literals block
     u8 *literals = NULL;
     const size_t literals_size = decode_literals(ctx, in, &literals);
@@ -673,7 +757,22 @@ static void fse_decode_hufweights(ostream_t *weights, istream_t *const in,
 
 static size_t decode_literals(frame_context_t *const ctx, istream_t *const in,
                               u8 **const literals) {
-    // Decode literals header
+    // "Literals can be stored uncompressed or compressed using Huffman prefix
+    // codes. When compressed, an optional tree description can be present,
+    // followed by 1 or 4 streams."
+    //
+    // "Literals_Section_Header
+    //
+    // Header is in charge of describing how literals are packed. It's a
+    // byte-aligned variable-size bitfield, ranging from 1 to 5 bytes, using
+    // little-endian convention."
+    //
+    // "Literals_Block_Type
+    //
+    // This field uses 2 lowest bits of first byte, describing 4 different block
+    // types"
+    //
+    // size_format takes between 1 and 2 bits
     int block_type = IO_read_bits(in, 2);
     int size_format = IO_read_bits(in, 2);
 
@@ -726,13 +825,13 @@ static size_t decode_literals_simple(istream_t *const in, u8 **const literals,
 
     switch (block_type) {
     case 0: {
-        // Raw data
+        // "Raw_Literals_Block - Literals are stored uncompressed."
         const u8 *const read_ptr = IO_read_bytes(in, size);
         memcpy(*literals, read_ptr, size);
         break;
     }
     case 1: {
-        // Single repeated byte
+        // "RLE_Literals_Block - Literals consist of a single byte value repeated N times."
         const u8 *const read_ptr = IO_read_bytes(in, 1);
         memset(*literals, read_ptr[0], size);
         break;
@@ -796,6 +895,8 @@ static size_t decode_literals_compressed(frame_context_t *const ctx,
 
     if (block_type == 2) {
         // Decode provided Huffman table
+        // "This section is only present when Literals_Block_Type type is
+        // Compressed_Literals_Block (2)."
 
         HUF_free_dtable(&ctx->literals_dtable);
         decode_huf_table(&huf_stream, &ctx->literals_dtable);
@@ -824,22 +925,32 @@ static size_t decode_literals_compressed(frame_context_t *const ctx,
 static void decode_huf_table(istream_t *const in, HUF_dtable *const dtable) {
     const u8 header = IO_read_bits(in, 8);
 
+    // "All literal values from zero (included) to last present one (excluded)
+    // are represented by Weight with values from 0 to Max_Number_of_Bits."
+
+    // "This is a single byte value (0-255), which describes how to decode the list of weights."
     u8 weights[HUF_MAX_SYMBS];
     memset(weights, 0, sizeof(weights));
 
     int num_symbs;
 
     if (header >= 128) {
-        // Direct representation, read the weights out
+        // "This is a direct representation, where each Weight is written
+        // directly as a 4 bits field (0-15). The full representation occupies
+        // ((Number_of_Symbols+1)/2) bytes, meaning it uses a last full byte
+        // even if Number_of_Symbols is odd. Number_of_Symbols = headerByte -
+        // 127"
         num_symbs = header - 127;
         const size_t bytes = (num_symbs + 1) / 2;
 
         const u8 *const weight_src = IO_read_bytes(in, bytes);
 
         for (int i = 0; i < num_symbs; i++) {
-            // read_bits_LE isn't applicable here because the weights are order
-            // reversed within each byte
-            // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#huffman-tree-header
+            // "They are encoded forward, 2
+            // weights to a byte with the first weight taking the top four bits
+            // and the second taking the bottom four (e.g. the following
+            // operations could be used to read the weights: Weight[0] =
+            // (Byte[0] >> 4), Weight[1] = (Byte[0] & 0xf), etc.)."
             if (i % 2 == 0) {
                 weights[i] = weight_src[i / 2] >> 4;
             } else {
@@ -864,7 +975,9 @@ static void fse_decode_hufweights(ostream_t *weights, istream_t *const in,
 
     FSE_dtable dtable;
 
-    // Construct the FSE table
+    // "An FSE bitstream starts by a header, describing probabilities
+    // distribution. It will create a Decoding Table. For a list of Huffman
+    // weights, maximum accuracy is 7 bits."
     FSE_decode_header(&dtable, in, MAX_ACCURACY_LOG);
 
     // Decode the weights
@@ -947,9 +1060,19 @@ static void decode_seq_table(istream_t *const in, FSE_dtable *const table,
 
 static size_t decode_sequences(frame_context_t *const ctx, istream_t *in,
                                sequence_command_t **const sequences) {
+    // "A compressed block is a succession of sequences . A sequence is a
+    // literal copy command, followed by a match copy command. A literal copy
+    // command specifies a length. It is the number of bytes to be copied (or
+    // extracted) from the literal section. A match copy command specifies an
+    // offset and a length. The offset gives the position to copy from, which
+    // can be within a previous block."
+
     size_t num_sequences;
 
-    // Decode the sequence header and allocate space for the output
+    // "Number_of_Sequences
+    //
+    // This is a variable size field using between 1 and 3 bytes. Let's call its
+    // first byte byte0."
     u8 header = IO_read_bits(in, 8);
     if (header == 0) {
         // "There are no sequences. The sequence section stops there.
@@ -980,12 +1103,33 @@ static size_t decode_sequences(frame_context_t *const ctx, istream_t *in,
 static void decompress_sequences(frame_context_t *const ctx, istream_t *in,
                                  sequence_command_t *const sequences,
                                  const size_t num_sequences) {
+    // "The Sequences_Section regroup all symbols required to decode commands.
+    // There are 3 symbol types : literals lengths, offsets and match lengths.
+    // They are encoded together, interleaved, in a single bitstream."
+
+    // "Symbol compression modes
+    //
+    // This is a single byte, defining the compression mode of each symbol
+    // type."
+    //
+    // Bit number : Field name
+    // 7-6        : Literals_Lengths_Mode
+    // 5-4        : Offsets_Mode
+    // 3-2        : Match_Lengths_Mode
+    // 1-0        : Reserved
     u8 compression_modes = IO_read_bits(in, 8);
 
     if ((compression_modes & 3) != 0) {
+        // Reserved bits set
         CORRUPTION();
     }
 
+    // "Following the header, up to 3 distribution tables can be described. When
+    // present, they are in this order :
+    //
+    // Literals lengths
+    // Offsets
+    // Match Lengths"
     // Update the tables we have stored in the context
     decode_seq_table(in, &ctx->ll_dtable, seq_literal_length,
                      (compression_modes >> 6) & 3);
@@ -1016,6 +1160,12 @@ static void decompress_sequences(frame_context_t *const ctx, istream_t *in,
     const int padding = 8 - log2inf(src[len - 1]);
     i64 offset = len * 8 - padding;
 
+    // "The bitstream starts with initial state values, each using the required
+    // number of bits in their respective accuracy, decoded previously from
+    // their normalized distribution.
+    //
+    // It starts by Literals_Length_State, followed by Offset_State, and finally
+    // Match_Length_State."
     FSE_init_state(&state.ll_table, &state.ll_state, src, &offset);
     FSE_init_state(&state.of_table, &state.of_state, src, &offset);
     FSE_init_state(&state.ml_table, &state.ml_state, src, &offset);
@@ -1036,6 +1186,10 @@ static void decompress_sequences(frame_context_t *const ctx, istream_t *in,
 static sequence_command_t decode_sequence(sequence_state_t *const state,
                                           const u8 *const src,
                                           i64 *const offset) {
+    // "Each symbol is a code in its own context, which specifies Baseline and
+    // Number_of_Bits to add. Codes are FSE compressed, and interleaved with raw
+    // additional bits in the same bitstream."
+
     // Decode symbols, but don't update states
     const u8 of_code = FSE_peek_symbol(&state->of_table, state->of_state);
     const u8 ll_code = FSE_peek_symbol(&state->ll_table, state->ll_state);
@@ -1049,18 +1203,24 @@ static sequence_command_t decode_sequence(sequence_state_t *const state,
 
     // Read the interleaved bits
     sequence_command_t seq;
-    // Offset computation works differently
+    // "Decoding starts by reading the Number_of_Bits required to decode Offset.
+    // It then does the same for Match_Length, and then for Literals_Length."
     seq.offset = ((u32)1 << of_code) + STREAM_read_bits(src, of_code, offset);
+
     seq.match_length =
         SEQ_MATCH_LENGTH_BASELINES[ml_code] +
         STREAM_read_bits(src, SEQ_MATCH_LENGTH_EXTRA_BITS[ml_code], offset);
+
     seq.literal_length =
         SEQ_LITERAL_LENGTH_BASELINES[ll_code] +
         STREAM_read_bits(src, SEQ_LITERAL_LENGTH_EXTRA_BITS[ll_code], offset);
 
+    // "If it is not the last sequence in the block, the next operation is to
+    // update states. Using the rules pre-calculated in the decoding tables,
+    // Literals_Length_State is updated, followed by Match_Length_State, and
+    // then Offset_State."
     // If the stream is complete don't read bits to update state
     if (*offset != 0) {
-        // Update state in the order specified in the specification
         FSE_update_state(&state->ll_table, &state->ll_state, src, offset);
         FSE_update_state(&state->ml_table, &state->ml_state, src, offset);
         FSE_update_state(&state->of_table, &state->of_state, src, offset);
@@ -1088,6 +1248,7 @@ static void decode_seq_table(istream_t *const in, FSE_dtable *const table,
 
     switch (mode) {
     case seq_predefined: {
+        // "Predefined_Mode : uses a predefined distribution table."
         const i16 *distribution = default_distributions[type];
         const size_t symbs = default_distribution_lengths[type];
         const size_t accuracy_log = default_distribution_accuracies[type];
@@ -1096,15 +1257,20 @@ static void decode_seq_table(istream_t *const in, FSE_dtable *const table,
         break;
     }
     case seq_rle: {
+        // "RLE_Mode : it's a single code, repeated Number_of_Sequences times."
         const u8 symb = IO_read_bits(in, 8);
         FSE_init_dtable_rle(table, symb);
         break;
     }
     case seq_fse: {
+        // "FSE_Compressed_Mode : standard FSE compression. A distribution table
+        // will be present."
         FSE_decode_header(table, in, max_accuracies[type]);
         break;
     }
     case seq_repeat:
+        // "Repeat_Mode : re-use distribution table from previous compressed
+        // block."
         // Nothing to do here, table will be unchanged
         break;
     default:
@@ -1322,8 +1488,8 @@ static void traverse_frame(const frame_header_t *const header, istream_t *const
 /******* END OUTPUT SIZE COUNTING *********************************************/
 
 /******* DICTIONARY PARSING ***************************************************/
-static void init_raw_content_dict(dictionary_t *const dict, const u8 *const src,
-                                  const size_t src_len);
+static void init_dictionary_content(dictionary_t *const dict,
+                                    istream_t *const in);
 
 static void parse_dictionary(dictionary_t *const dict, const u8 *src,
                              size_t src_len) {
@@ -1337,13 +1503,20 @@ static void parse_dictionary(dictionary_t *const dict, const u8 *src,
     const u32 magic_number = IO_read_bits(&in, 32);
     if (magic_number != 0xEC30A437) {
         // raw content dict
-        init_raw_content_dict(dict, src, src_len);
+        IO_rewind_bits(&in, 32);
+        init_dictionary_content(dict, &in);
         return;
     }
 
     dict->dictionary_id = IO_read_bits(&in, 32);
 
-    // Parse the provided entropy tables in order
+    // "Entropy_Tables : following the same format as the tables in compressed
+    // blocks. They are stored in following order : Huffman tables for literals,
+    // FSE table for offsets, FSE table for match lengths, and FSE table for
+    // literals lengths. It's finally followed by 3 offset values, populating
+    // recent offsets (instead of using {1,4,8}), stored in order, 4-bytes
+    // little-endian each, for a total of 12 bytes. Each recent offset must have
+    // a value < dictionary size."
     decode_huf_table(&in, &dict->literals_dtable);
     decode_seq_table(&in, &dict->of_dtable, seq_offset, seq_fse);
     decode_seq_table(&in, &dict->ml_dtable, seq_match_length, seq_fse);
@@ -1355,38 +1528,33 @@ static void parse_dictionary(dictionary_t *const dict, const u8 *src,
     dict->previous_offsets[2] = IO_read_bits(&in, 32);
 
     // Ensure the provided offsets aren't too large
+    // "Each recent offset must have a value < dictionary size."
     for (int i = 0; i < 3; i++) {
         if (dict->previous_offsets[i] > src_len) {
             ERROR("Dictionary corrupted");
         }
     }
 
-    // The rest is the content
-    dict->content_size = IO_istream_len(&in);
+    // "Content : The rest of the dictionary is its content. The content act as
+    // a "past" in front of data to compress or decompress, so it can be
+    // referenced in sequence commands."
+    init_dictionary_content(dict, &in);
+}
+
+static void init_dictionary_content(dictionary_t *const dict,
+                                    istream_t *const in) {
+    // Copy in the content
+    dict->content_size = IO_istream_len(in);
     dict->content = malloc(dict->content_size);
     if (!dict->content) {
         BAD_ALLOC();
     }
 
-    const u8 *const content = IO_read_bytes(&in, dict->content_size);
+    const u8 *const content = IO_read_bytes(in, dict->content_size);
 
     memcpy(dict->content, content, dict->content_size);
 }
 
-/// If parse_dictionary is given a raw content dictionary, it delegates here
-static void init_raw_content_dict(dictionary_t *const dict, const u8 *const src,
-                                  const size_t src_len) {
-    dict->dictionary_id = 0;
-    // Copy in the content
-    dict->content = malloc(src_len);
-    if (!dict->content) {
-        BAD_ALLOC();
-    }
-
-    dict->content_size = src_len;
-    memcpy(dict->content, src, src_len);
-}
-
 /// Free an allocated dictionary
 static void free_dictionary(dictionary_t *const dict) {
     HUF_free_dtable(&dict->literals_dtable);
@@ -1636,8 +1804,15 @@ static size_t HUF_decompress_1stream(const HUF_dtable *const dtable,
     }
     const u8 *const src = IO_read_bytes(in, len);
 
-    // To maintain similarity with FSE, start from the end
-    // Find the last 1 bit
+    // "Each bitstream must be read backward, that is starting from the end down
+    // to the beginning. Therefore it's necessary to know the size of each
+    // bitstream.
+    //
+    // It's also necessary to know exactly which bit is the latest. This is
+    // detected by a final bit flag : the highest bit of latest byte is a
+    // final-bit-flag. Consequently, a last byte of 0 is not possible. And the
+    // final-bit-flag itself is not part of the useful bitstream. Hence, the
+    // last byte contains between 0 and 7 useful bits."
     const int padding = 8 - log2inf(src[len - 1]);
 
     i64 offset = len * 8 - padding;
@@ -1651,6 +1826,10 @@ static size_t HUF_decompress_1stream(const HUF_dtable *const dtable,
         IO_write_byte(out, HUF_decode_symbol(dtable, &state, src, &offset));
         symbols_written++;
     }
+    // "The process continues up to reading the required number of symbols per
+    // stream. If a bitstream is not entirely and exactly consumed, hence
+    // reaching exactly its beginning position with all bits consumed, the
+    // decoding process is considered faulty."
 
     // When all symbols have been decoded, the final state value shouldn't have
     // any data from the stream, so it should have "read" dtable->max_bits from
@@ -1666,6 +1845,11 @@ static size_t HUF_decompress_1stream(const HUF_dtable *const dtable,
 
 static size_t HUF_decompress_4stream(const HUF_dtable *const dtable,
                                      ostream_t *const out, istream_t *const in) {
+    // "Compressed size is provided explicitly : in the 4-streams variant,
+    // bitstreams are preceded by 3 unsigned little-endian 16-bits values. Each
+    // value represents the compressed size of one stream, in order. The last
+    // stream size is deducted from total compressed size and from previously
+    // decoded stream sizes."
     const size_t csize1 = IO_read_bits(in, 16);
     const size_t csize2 = IO_read_bits(in, 16);
     const size_t csize3 = IO_read_bits(in, 16);
@@ -1719,6 +1903,10 @@ static void HUF_init_dtable(HUF_dtable *const table, const u8 *const bits,
         BAD_ALLOC();
     }
 
+    // "Symbols are sorted by Weight. Within same Weight, symbols keep natural
+    // order. Symbols with a Weight of zero are removed. Then, starting from
+    // lowest weight, prefix codes are distributed in order."
+
     u32 rank_idx[HUF_MAX_BITS + 1];
     // Initialize the starting codes for each rank (number of bits)
     rank_idx[max_bits] = 0;
@@ -1779,6 +1967,7 @@ static void HUF_init_dtable_usingweights(HUF_dtable *const table,
     const int last_weight = log2inf(left_over) + 1;
 
     for (int i = 0; i < num_symbs; i++) {
+        // "Number_of_Bits = Weight ? (Max_Number_of_Bits + 1 - Weight) : 0"
         bits[i] = weights[i] > 0 ? (max_bits + 1 - weights[i]) : 0;
     }
     bits[num_symbs] =
@@ -1857,12 +2046,23 @@ static size_t FSE_decompress_interleaved2(const FSE_dtable *const dtable,
     }
     const u8 *const src = IO_read_bytes(in, len);
 
-    // Find the last 1 bit
+    // "Each bitstream must be read backward, that is starting from the end down
+    // to the beginning. Therefore it's necessary to know the size of each
+    // bitstream.
+    //
+    // It's also necessary to know exactly which bit is the latest. This is
+    // detected by a final bit flag : the highest bit of latest byte is a
+    // final-bit-flag. Consequently, a last byte of 0 is not possible. And the
+    // final-bit-flag itself is not part of the useful bitstream. Hence, the
+    // last byte contains between 0 and 7 useful bits."
     const int padding = 8 - log2inf(src[len - 1]);
     i64 offset = len * 8 - padding;
 
-    // The end of the stream contains the 2 states, in this order
     u16 state1, state2;
+    // "The first state (State1) encodes the even indexed symbols, and the
+    // second (State2) encodes the odd indexes. State1 is initialized first, and
+    // then State2, and they take turns decoding a single symbol and updating
+    // their state."
     FSE_init_state(dtable, &state1, src, &offset);
     FSE_init_state(dtable, &state2, src, &offset);
 
@@ -1871,6 +2071,11 @@ static size_t FSE_decompress_interleaved2(const FSE_dtable *const dtable,
     // negative
     size_t symbols_written = 0;
     while (1) {
+        // "The number of symbols to decode is determined by tracking bitStream
+        // overflow condition: If updating state after decoding a symbol would
+        // require more bits than remain in the stream, it is assumed the extra
+        // bits are 0. Then, the symbols for each of the final states are
+        // decoded and the process is complete."
         IO_write_byte(out, FSE_decode_symbol(dtable, &state1, src, &offset));
         symbols_written++;
         if (offset < 0) {
@@ -1920,6 +2125,10 @@ static void FSE_init_dtable(FSE_dtable *const dtable,
     // which can be larger than a byte can store
     u16 state_desc[FSE_MAX_SYMBS];
 
+    // "Symbols are scanned in their natural order for "less than 1"
+    // probabilities. Symbols with this probability are being attributed a
+    // single cell, starting from the end of the table. These symbols define a
+    // full state reset, reading Accuracy_Log bits."
     int high_threshold = size;
     for (int s = 0; s < num_symbs; s++) {
         // Scan for low probability symbols to put at the top
@@ -1929,6 +2138,9 @@ static void FSE_init_dtable(FSE_dtable *const dtable,
         }
     }
 
+    // "All remaining symbols are sorted in their natural order. Starting from
+    // symbol 0 and table position 0, each symbol gets attributed as many cells
+    // as its probability. Cell allocation is spreaded, not linear."
     // Place the rest in the table
     const u16 step = (size >> 1) + (size >> 3) + 3;
     const u16 mask = size - 1;
@@ -1943,11 +2155,12 @@ static void FSE_init_dtable(FSE_dtable *const dtable,
         for (int i = 0; i < norm_freqs[s]; i++) {
             // Give `norm_freqs[s]` states to symbol s
             dtable->symbols[pos] = s;
+            // "A position is skipped if already occupied, typically by a "less
+            // than 1" probability symbol."
             do {
                 pos = (pos + step) & mask;
             } while (pos >=
-                     high_threshold); // Make sure we don't occupy a spot taken
-                                      // by the low prob symbols
+                     high_threshold);
             // Note: no other collision checking is necessary as `step` is
             // coprime to `size`, so the cycle will visit each position exactly
             // once
@@ -1975,30 +2188,53 @@ static void FSE_init_dtable(FSE_dtable *const dtable,
 /// use the decoded frequencies to initialize a decoding table.
 static void FSE_decode_header(FSE_dtable *const dtable, istream_t *const in,
                                 const int max_accuracy_log) {
+    // "An FSE distribution table describes the probabilities of all symbols
+    // from 0 to the last present one (included) on a normalized scale of 1 <<
+    // Accuracy_Log .
+    //
+    // It's a bitstream which is read forward, in little-endian fashion. It's
+    // not necessary to know its exact size, since it will be discovered and
+    // reported by the decoding process."
     if (max_accuracy_log > FSE_MAX_ACCURACY_LOG) {
         ERROR("FSE accuracy too large");
     }
 
+    // "The bitstream starts by reporting on which scale it operates.
+    // Accuracy_Log = low4bits + 5. Note that maximum Accuracy_Log for literal
+    // and match lengths is 9, and for offsets is 8. Higher values are
+    // considered errors."
     const int accuracy_log = 5 + IO_read_bits(in, 4);
     if (accuracy_log > max_accuracy_log) {
         ERROR("FSE accuracy too large");
     }
 
-    // The +1 facilitates the `-1` probabilities
-    i32 remaining = (1 << accuracy_log) + 1;
+    // "Then follows each symbol value, from 0 to last present one. The number
+    // of bits used by each field is variable. It depends on :
+    //
+    // Remaining probabilities + 1 : example : Presuming an Accuracy_Log of 8,
+    // and presuming 100 probabilities points have already been distributed, the
+    // decoder may read any value from 0 to 255 - 100 + 1 == 156 (inclusive).
+    // Therefore, it must read log2sup(156) == 8 bits.
+    //
+    // Value decoded : small values use 1 less bit : example : Presuming values
+    // from 0 to 156 (inclusive) are possible, 255-156 = 99 values are remaining
+    // in an 8-bits field. They are used this way : first 99 values (hence from
+    // 0 to 98) use only 7 bits, values from 99 to 156 use 8 bits. "
+
+    i32 remaining = 1 << accuracy_log;
     i16 frequencies[FSE_MAX_SYMBS];
 
     int symb = 0;
-    while (remaining > 1 && symb < FSE_MAX_SYMBS) {
+    while (remaining > 0 && symb < FSE_MAX_SYMBS) {
         // Log of the number of possible values we could read
-        int bits = log2inf(remaining) + 1;
+        int bits = log2inf(remaining + 1) + 1;
 
         u16 val = IO_read_bits(in, bits);
 
         // Try to mask out the lower bits to see if it qualifies for the "small
         // value" threshold
         const u16 lower_mask = ((u16)1 << (bits - 1)) - 1;
-        const u16 threshold = ((u16)1 << bits) - 1 - remaining;
+        const u16 threshold = ((u16)1 << bits) - 1 - (remaining + 1);
 
         if ((val & lower_mask) < threshold) {
             IO_rewind_bits(in, 1);
@@ -2007,14 +2243,23 @@ static void FSE_decode_header(FSE_dtable *const dtable, istream_t *const in,
             val = val - threshold;
         }
 
+        // "Probability is obtained from Value decoded by following formula :
+        // Proba = value - 1"
         const i16 proba = (i16)val - 1;
-        // A value of -1 is possible, and has special meaning
+
+        // "It means value 0 becomes negative probability -1. -1 is a special
+        // probability, which means "less than 1". Its effect on distribution
+        // table is described in next paragraph. For the purpose of calculating
+        // cumulated distribution, it counts as one."
         remaining -= proba < 0 ? -proba : proba;
 
         frequencies[symb] = proba;
         symb++;
 
-        // Handle the special probability = 0 case
+        // "When a symbol has a probability of zero, it is followed by a 2-bits
+        // repeat flag. This repeat flag tells how many probabilities of zeroes
+        // follow the current one. It provides a number ranging from 0 to 3. If
+        // it is a 3, another 2-bits repeat flag follows, and so on."
         if (proba == 0) {
             // Read the next two bits to see how many more 0s
             int repeat = IO_read_bits(in, 2);
@@ -2033,7 +2278,10 @@ static void FSE_decode_header(FSE_dtable *const dtable, istream_t *const in,
     }
     IO_align_stream(in);
 
-    if (remaining != 1 || symb >= FSE_MAX_SYMBS) {
+    // "When last symbol reaches cumulated total of 1 << Accuracy_Log, decoding
+    // is complete. If the last symbol makes cumulated total go above 1 <<
+    // Accuracy_Log, distribution is considered corrupted."
+    if (remaining != 0 || symb >= FSE_MAX_SYMBS) {
         CORRUPTION();
     }
 

From d44d363ec176734191027779a435d387ba9f1d37 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Mon, 6 Feb 2017 10:01:06 -0800
Subject: [PATCH 09/15] changed download URL for github_users sample set

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b5e16dff..6336d900 100644
--- a/README.md
+++ b/README.md
@@ -55,7 +55,7 @@ To solve this situation, Zstd offers a __training mode__, which can be used to t
 Training Zstandard is achieved by provide it with a few samples (one file per sample). The result of this training is stored in a file called "dictionary", which must be loaded before compression and decompression.
 Using this dictionary, the compression ratio achievable on small data improves dramatically.
 
-The following example uses the `github-users` [sample set](https://www.dropbox.com/s/mnktkomhkjbf1i2/github_users.tar.zst?dl=0), created from [github public API](https://developer.github.com/v3/users/#get-all-users).
+The following example uses the `github-users` [sample set](https://github.com/facebook/zstd/releases/tag/v1.1.3), created from [github public API](https://developer.github.com/v3/users/#get-all-users).
 It consists of roughly 10K records weighting about 1KB each.
 
 Compression Ratio | Compression Speed | Decompression Speed

From 2cb8ee878437fb627bbee838ad8d125dafd84285 Mon Sep 17 00:00:00 2001
From: Nick Terrell <terrelln@fb.com>
Date: Mon, 6 Feb 2017 11:32:13 -0800
Subject: [PATCH 10/15] Change zlib include to be a system include

---
 programs/fileio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/programs/fileio.c b/programs/fileio.c
index a9e1574a..087bf950 100644
--- a/programs/fileio.c
+++ b/programs/fileio.c
@@ -39,7 +39,7 @@
 #  include "zstdmt_compress.h"
 #endif
 #ifdef ZSTD_GZDECOMPRESS
-#  include "zlib.h"
+#  include <zlib.h>
 #  if !defined(z_const)
 #    define z_const
 #  endif

From 7e3fc73795064bdbb49f59d7af1657b994c8b057 Mon Sep 17 00:00:00 2001
From: Nick Terrell <terrelln@fb.com>
Date: Mon, 6 Feb 2017 11:54:31 -0800
Subject: [PATCH 11/15] Ensure <zlib.h> can be included in HAVE_ZLIB test

---
 programs/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/programs/Makefile b/programs/Makefile
index 599bef69..ae798c2a 100644
--- a/programs/Makefile
+++ b/programs/Makefile
@@ -67,7 +67,7 @@ endif
 
 # zlib detection
 VOID = /dev/null
-HAVE_ZLIB := $(shell echo "int main(){}" | $(CC) -o $(VOID) -x c - -lz 2> $(VOID) && echo 1 || echo 0)
+HAVE_ZLIB := $(shell printf "\#include <zlib.h>\nint main(){}\n" | $(CC) -o $(VOID) -x c - -lz 2> $(VOID) && echo 1 || echo 0)
 ifeq ($(HAVE_ZLIB), 1)
 ZLIBCPP = -DZSTD_GZDECOMPRESS
 ZLIBLD = -lz

From 816edeb9c244dcd8007433bb2bb1fb69e3ca0dc1 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Mon, 6 Feb 2017 17:39:54 -0800
Subject: [PATCH 12/15] corrected contributor's name

---
 NEWS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS b/NEWS
index 4f746305..a8a2ade2 100644
--- a/NEWS
+++ b/NEWS
@@ -12,7 +12,7 @@ API : new : ZDICT_finalizeDictionary()
 API : fix : ZSTD_initCStream_usingCDict() properly writes dictID into frame header, by Gregory Szorc (#511)
 API : fix : all symbols properly exposed in libzstd, by Nick Terrell
 build : support for Solaris target, by Przemyslaw Skibinski
-doc : clarified specification, by Andrew Purcell
+doc : clarified specification, by Sean Purcell
 
 v1.1.2
 API : streaming : decompression : changed : automatic implicit reset when chain-decoding new frames without init

From 71c5263c00e7b18e16ae394b23cef18487cf83fd Mon Sep 17 00:00:00 2001
From: Nick Terrell <terrelln@fb.com>
Date: Tue, 7 Feb 2017 11:35:07 -0800
Subject: [PATCH 13/15] Attribute cover dictionary code

---
 lib/dictBuilder/cover.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c
index c5b606db..1ced645b 100644
--- a/lib/dictBuilder/cover.c
+++ b/lib/dictBuilder/cover.c
@@ -7,6 +7,16 @@
  * of patent rights can be found in the PATENTS file in the same directory.
  */
 
+/* *****************************************************************************
+ * Constructs a dictionary using a heuristic based on the following paper:
+ *
+ * Liao, Petri, Moffat, Wirth
+ * Effective Construction of Relative Lempel-Ziv Dictionaries
+ * Published in WWW 2016.
+ *
+ * Adapted from code originally written by @ot (Giuseppe Ottaviano).
+ ******************************************************************************/
+
 /*-*************************************
 *  Dependencies
 ***************************************/
@@ -621,13 +631,6 @@ static ZDICT_params_t COVER_translateParams(COVER_params_t parameters) {
   return zdictParams;
 }
 
-/**
- * Constructs a dictionary using a heuristic based on the following paper:
- *
- * Liao, Petri, Moffat, Wirth
- * Effective Construction of Relative Lempel-Ziv Dictionaries
- * Published in WWW 2016.
- */
 ZDICTLIB_API size_t COVER_trainFromBuffer(
     void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
     const size_t *samplesSizes, unsigned nbSamples, COVER_params_t parameters) {

From 00ea51f8066d5b624d2c221ee141b9ea593b9f1e Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 7 Feb 2017 12:05:28 -0800
Subject: [PATCH 14/15] completed NEWS for v1.1.3

---
 NEWS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS b/NEWS
index a8a2ade2..24860c95 100644
--- a/NEWS
+++ b/NEWS
@@ -1,7 +1,7 @@
 v1.1.3
 cli : zstd can decompress .gz files (can be disabled with `make zstd-nogz` or `make HAVE_ZLIB=0`)
 cli : new : experimental target `make zstdmt`, with multi-threading support
-cli : new : improved dictionary builder "cover" (experimental), by Nick Terrell
+cli : new : improved dictionary builder "cover" (experimental), by Nick Terrell, based on prior work by Giuseppe Ottaviano.
 cli : new : advanced commands for detailed parameters, by Przemyslaw Skibinski
 cli : fix zstdless on Mac OS-X, by Andrew Janke
 cli : fix #232 "compress non-files"

From eb52dbd4fe052b4786250635877fadeebfd127f5 Mon Sep 17 00:00:00 2001
From: Sean Purcell <me@seanp.xyz>
Date: Tue, 7 Feb 2017 14:44:11 -0800
Subject: [PATCH 15/15] Minor changes to educational decoder

---
 contrib/educational_decoder/zstd_decompress.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/contrib/educational_decoder/zstd_decompress.c b/contrib/educational_decoder/zstd_decompress.c
index b46bb487..85625598 100644
--- a/contrib/educational_decoder/zstd_decompress.c
+++ b/contrib/educational_decoder/zstd_decompress.c
@@ -1258,7 +1258,7 @@ static void decode_seq_table(istream_t *const in, FSE_dtable *const table,
     }
     case seq_rle: {
         // "RLE_Mode : it's a single code, repeated Number_of_Sequences times."
-        const u8 symb = IO_read_bits(in, 8);
+        const u8 symb = IO_read_bytes(in, 1)[0];
         FSE_init_dtable_rle(table, symb);
         break;
     }
@@ -1572,8 +1572,8 @@ static void free_dictionary(dictionary_t *const dict) {
 #define UNALIGNED() ERROR("Attempting to operate on a non-byte aligned stream")
 /// Reads `num` bits from a bitstream, and updates the internal offset
 static inline u64 IO_read_bits(istream_t *const in, const int num) {
-    if (num > 64) {
-        return -1;
+    if (num > 64 || num <= 0) {
+        ERROR("Attempt to read an invalid number of bits");
     }
 
     const size_t bytes = (num + in->bit_offset + 7) / 8;
@@ -1710,7 +1710,7 @@ static inline istream_t IO_make_sub_istream(istream_t *const in, size_t len) {
 static inline u64 read_bits_LE(const u8 *src, const int num,
                                const size_t offset) {
     if (num > 64) {
-        return -1;
+        ERROR("Attempt to read an invalid number of bits");
     }
 
     // Skip over bytes that aren't in range
@@ -1871,6 +1871,11 @@ static size_t HUF_decompress_4stream(const HUF_dtable *const dtable,
     return total_output;
 }
 
+/// Initializes a Huffman table using canonical Huffman codes
+/// For more explanation on canonical Huffman codes see
+/// http://www.cs.uofs.edu/~mccloske/courses/cmps340/huff_canonical_dec2015.html
+/// Codes within a level are allocated in symbol order (i.e. smaller symbols get
+/// earlier codes)
 static void HUF_init_dtable(HUF_dtable *const table, const u8 *const bits,
                             const int num_symbs) {
     memset(table, 0, sizeof(HUF_dtable));
@@ -2004,6 +2009,9 @@ static void HUF_copy_dtable(HUF_dtable *const dst,
 /******* END HUFFMAN PRIMITIVES ***********************************************/
 
 /******* FSE PRIMITIVES *******************************************************/
+/// For more description of FSE see
+/// https://github.com/Cyan4973/FiniteStateEntropy/
+
 /// Allow a symbol to be decoded without updating state
 static inline u8 FSE_peek_symbol(const FSE_dtable *const dtable,
                                  const u16 state) {