Merge remote-tracking branch 'refs/remotes/facebook/dev' into dev11

2017-02-08 13:49:35 +01:00 · 2017-02-08 13:49:35 +01:00 · cdf5a7bd9f
parent 40580ff669 60259eb9a0
commit cdf5a7bd9f
9 changed files with 2515 additions and 12 deletions
--- a/4
+++ b/4
@ -1,7 +1,7 @@
 v1.1.3
 cli : zstd can decompress .gz files (can be disabled with `make zstd-nogz` or `make HAVE_ZLIB=0`)
 cli : new : experimental target `make zstdmt`, with multi-threading support
-cli : new : improved dictionary builder "cover" (experimental), by Nick Terrell
+cli : new : improved dictionary builder "cover" (experimental), by Nick Terrell, based on prior work by Giuseppe Ottaviano.
 cli : new : advanced commands for detailed parameters, by Przemyslaw Skibinski
 cli : fix zstdless on Mac OS-X, by Andrew Janke
 cli : fix #232 "compress non-files"
@ -12,7 +12,7 @@ API : new : ZDICT_finalizeDictionary()
 API : fix : ZSTD_initCStream_usingCDict() properly writes dictID into frame header, by Gregory Szorc (#511)
 API : fix : all symbols properly exposed in libzstd, by Nick Terrell
 build : support for Solaris target, by Przemyslaw Skibinski
-doc : clarified specification, by Andrew Purcell
+doc : clarified specification, by Sean Purcell

 v1.1.2
 API : streaming : decompression : changed : automatic implicit reset when chain-decoding new frames without init
--- a/README.md
+++ b/README.md
@ -55,7 +55,7 @@ To solve this situation, Zstd offers a __training mode__, which can be used to t
 Training Zstandard is achieved by provide it with a few samples (one file per sample). The result of this training is stored in a file called "dictionary", which must be loaded before compression and decompression.
 Using this dictionary, the compression ratio achievable on small data improves dramatically.

-The following example uses the `github-users` [sample set](https://www.dropbox.com/s/mnktkomhkjbf1i2/github_users.tar.zst?dl=0), created from [github public API](https://developer.github.com/v3/users/#get-all-users).
+The following example uses the `github-users` [sample set](https://github.com/facebook/zstd/releases/tag/v1.1.3), created from [github public API](https://developer.github.com/v3/users/#get-all-users).
 It consists of roughly 10K records weighting about 1KB each.

 Compression Ratio | Compression Speed | Decompression Speed
--- a/contrib/educational_decoder/README.md
+++ b/contrib/educational_decoder/README.md
@ -0,0 +1,19 @@
+Educational Decoder
+===================
+
+`zstd_decompress.c` is a self-contained implementation in C99 of a decoder,
+according to the [Zstandard format specification].
+While it does not implement as many features as the reference decoder,
+such as the streaming API or content checksums, it is written to be easy to
+follow and understand, to help understand how the Zstandard format works.
+It's laid out to match the [format specification],
+so it can be used to understand how complex segments could be implemented.
+It also contains implementations of Huffman and FSE table decoding.
+
+[Zstandard format specification]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md
+[format specification]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md
+
+`harness.c` provides a simple test harness around the decoder:
+
+    harness <input-file> <output-file> [dictionary]
+
--- a/contrib/educational_decoder/harness.c
+++ b/contrib/educational_decoder/harness.c
@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "zstd_decompress.h"
+
+typedef unsigned char u8;
+
+// If the data doesn't have decompressed size with it, fallback on assuming the
+// compression ratio is at most 16
+#define MAX_COMPRESSION_RATIO (16)
+
+// Protect against allocating too much memory for output
+#define MAX_OUTPUT_SIZE ((size_t)1024 * 1024 * 1024)
+
+u8 *input;
+u8 *output;
+u8 *dict;
+
+size_t read_file(const char *path, u8 **ptr) {
+    FILE *f = fopen(path, "rb");
+    if (!f) {
+        fprintf(stderr, "failed to open file %s\n", path);
+        exit(1);
+    }
+
+    fseek(f, 0L, SEEK_END);
+    size_t size = ftell(f);
+    rewind(f);
+
+    *ptr = malloc(size);
+    if (!ptr) {
+        fprintf(stderr, "failed to allocate memory to hold %s\n", path);
+        exit(1);
+    }
+
+    size_t pos = 0;
+    while (!feof(f)) {
+        size_t read = fread(&(*ptr)[pos], 1, size, f);
+        if (ferror(f)) {
+            fprintf(stderr, "error while reading file %s\n", path);
+            exit(1);
+        }
+        pos += read;
+    }
+
+    fclose(f);
+
+    return pos;
+}
+
+void write_file(const char *path, const u8 *ptr, size_t size) {
+    FILE *f = fopen(path, "wb");
+
+    size_t written = 0;
+    while (written < size) {
+        written += fwrite(&ptr[written], 1, size, f);
+        if (ferror(f)) {
+            fprintf(stderr, "error while writing file %s\n", path);
+            exit(1);
+        }
+    }
+
+    fclose(f);
+}
+
+int main(int argc, char **argv) {
+    if (argc < 3) {
+        fprintf(stderr, "usage: %s <file.zst> <out_path> [dictionary]\n",
+                argv[0]);
+
+        return 1;
+    }
+
+    size_t input_size = read_file(argv[1], &input);
+    size_t dict_size = 0;
+    if (argc >= 4) {
+        dict_size = read_file(argv[3], &dict);
+    }
+
+    size_t decompressed_size = ZSTD_get_decompressed_size(input, input_size);
+    if (decompressed_size == -1) {
+        decompressed_size = MAX_COMPRESSION_RATIO * input_size;
+        fprintf(stderr, "WARNING: Compressed data does not contain "
+                        "decompressed size, going to assume the compression "
+                        "ratio is at most %d (decompressed size of at most "
+                        "%zu)\n",
+                MAX_COMPRESSION_RATIO, decompressed_size);
+    }
+    if (decompressed_size > MAX_OUTPUT_SIZE) {
+        fprintf(stderr,
+                "Required output size too large for this implementation\n");
+        return 1;
+    }
+    output = malloc(decompressed_size);
+    if (!output) {
+        fprintf(stderr, "failed to allocate memory\n");
+        return 1;
+    }
+
+    size_t decompressed =
+        ZSTD_decompress_with_dict(output, decompressed_size,
+                                  input, input_size, dict, dict_size);
+
+    write_file(argv[2], output, decompressed);
+
+    free(input);
+    free(output);
+    free(dict);
+    input = output = dict = NULL;
+}
+
--- a/contrib/educational_decoder/zstd_decompress.c
+++ b/contrib/educational_decoder/zstd_decompress.c
--- a/contrib/educational_decoder/zstd_decompress.h
+++ b/contrib/educational_decoder/zstd_decompress.h
@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+size_t ZSTD_decompress(void *const dst, const size_t dst_len,
+                       const void *const src, const size_t src_len);
+size_t ZSTD_decompress_with_dict(void *const dst, const size_t dst_len,
+                                 const void *const src, const size_t src_len,
+                                 const void *const dict, const size_t dict_len);
+size_t ZSTD_get_decompressed_size(const void *const src, const size_t src_len);
+
--- a/lib/dictBuilder/cover.c
+++ b/lib/dictBuilder/cover.c
@ -7,6 +7,16 @@
 * of patent rights can be found in the PATENTS file in the same directory.
 */

+/* *****************************************************************************
+ * Constructs a dictionary using a heuristic based on the following paper:
+ *
+ * Liao, Petri, Moffat, Wirth
+ * Effective Construction of Relative Lempel-Ziv Dictionaries
+ * Published in WWW 2016.
+ *
+ * Adapted from code originally written by @ot (Giuseppe Ottaviano).
+ ******************************************************************************/
+
 /*-*************************************
 *  Dependencies
 ***************************************/
@ -621,13 +631,6 @@ static ZDICT_params_t COVER_translateParams(COVER_params_t parameters) {
  return zdictParams;
 }

-/**
- * Constructs a dictionary using a heuristic based on the following paper:
- *
- * Liao, Petri, Moffat, Wirth
- * Effective Construction of Relative Lempel-Ziv Dictionaries
- * Published in WWW 2016.
- */
 ZDICTLIB_API size_t COVER_trainFromBuffer(
    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
    const size_t *samplesSizes, unsigned nbSamples, COVER_params_t parameters) {
--- a/programs/Makefile
+++ b/programs/Makefile
@ -67,7 +67,7 @@ endif

 # zlib detection
 VOID = /dev/null
-HAVE_ZLIB := $(shell echo "int main(){}" | $(CC) -o $(VOID) -x c - -lz 2> $(VOID) && echo 1 || echo 0)
+HAVE_ZLIB := $(shell echo "\#include <zlib.h>\nint main(){}" | $(CC) -o $(VOID) -x c - -lz 2> $(VOID) && echo 1 || echo 0)
 ifeq ($(HAVE_ZLIB), 1)
 ZLIBCPP = -DZSTD_GZDECOMPRESS
 ZLIBLD = -lz
--- a/programs/fileio.c
+++ b/programs/fileio.c
@ -39,7 +39,7 @@
 #  include "zstdmt_compress.h"
 #endif
 #ifdef ZSTD_GZDECOMPRESS
-#  include "zlib.h"
+#  include <zlib.h>
 #  if !defined(z_const)
 #    define z_const
 #  endif