From a23a3b95f9c00ecf52216bd7fe768e41eac4e269 Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Fri, 13 Jul 2018 16:05:14 -0700
Subject: [PATCH 01/13] Add random dictionary builder

---
 contrib/randomDictBuilder/Makefile  |  48 +++
 contrib/randomDictBuilder/README.md |  13 +
 contrib/randomDictBuilder/main.c    | 125 ++++++++
 contrib/randomDictBuilder/random.c  | 455 ++++++++++++++++++++++++++++
 contrib/randomDictBuilder/random.h  |  53 ++++
 contrib/randomDictBuilder/test.sh   |  14 +
 6 files changed, 708 insertions(+)
 create mode 100644 contrib/randomDictBuilder/Makefile
 create mode 100644 contrib/randomDictBuilder/README.md
 create mode 100644 contrib/randomDictBuilder/main.c
 create mode 100644 contrib/randomDictBuilder/random.c
 create mode 100644 contrib/randomDictBuilder/random.h
 create mode 100644 contrib/randomDictBuilder/test.sh

diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile
new file mode 100644
index 00000000..a2aade23
--- /dev/null
+++ b/contrib/randomDictBuilder/Makefile
@@ -0,0 +1,48 @@
+PROGRAM_FILES := ../../programs/fileio.c
+
+TEST_INPUT := ../../lib
+TEST_OUTPUT := randomDict
+ARG :=
+
+all: main testrun test clean
+
+run: main rand clean
+
+.PHONY: rand
+rand:
+	echo "Building a random dictionary with given arguments"
+	./main $(ARG)
+
+
+main: random.o main.o libzstd.a
+	gcc random.o main.o libzstd.a -o main
+
+main.o: main.c
+	gcc -c main.c -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder -I random.h
+
+random.o: $(PROGRAM_FILES) random.c
+	gcc -c $(PROGRAM_FILES) -I ../../programs -I ../../lib/common -I random.h random.c
+
+libzstd.a:
+	$(MAKE) -C ../../lib libzstd.a
+	mv ../../lib/libzstd.a .
+
+.PHONY: testrun
+testrun: main
+	echo "Run with $(TEST_INPUT) and $(TEST_OUTPUT) "
+	./main in=$(TEST_INPUT) out=$(TEST_OUTPUT)
+	zstd -be3 -D $(TEST_OUTPUT) -r $(TEST_INPUT) -q
+	rm -f $(TEST_OUTPUT)
+
+.PHONY: test
+test: test.sh
+	sh test.sh
+	echo "Finish running test.sh"
+
+.PHONY: clean
+clean:
+	rm -f libzstd.a main
+	rm -f ../../lib/*/*.o
+	rm -f ../../programs/*.o
+	rm -f *.o
+	echo "Cleaning is completed"
diff --git a/contrib/randomDictBuilder/README.md b/contrib/randomDictBuilder/README.md
new file mode 100644
index 00000000..cadffdf2
--- /dev/null
+++ b/contrib/randomDictBuilder/README.md
@@ -0,0 +1,13 @@
+Random Dictionary Builder
+
+### Permitted Arguments:
+Input Files (in=fileName): files used to build dictionary, can include multiple files, each following "in=", required
+Output Dictionary (out=dictName): if not provided, default to defaultDict
+Dictionary ID (dictID=#): positive number, if not provided, default to 0
+Maximum Dictionary Size (maxdict=#): positive number, in bytes, if not provided, default to 110KB
+Size of Randomly Selected Segment (k=#): positive number, in bytes, if not provided, default to 200
+Compression Level (c=#): positive number, if not provided, default to 3
+
+### Examples:
+make run ARG="in=../../lib/dictBuilder out=dict100 dictID=520"
+make run ARG="in=../../lib/dictBuilder in=../../lib/compress"
diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c
new file mode 100644
index 00000000..15eb5c44
--- /dev/null
+++ b/contrib/randomDictBuilder/main.c
@@ -0,0 +1,125 @@
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h>   /* strcmp, strlen */
+#include <errno.h>    /* errno */
+#include <ctype.h>
+#include "fileio.h"   /* stdinmark, stdoutmark, ZSTD_EXTENSION */
+#include "random.h"
+#include "util.h"
+
+#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
+
+static const unsigned g_defaultMaxDictSize = 110 KB;
+#define DEFAULT_CLEVEL 3
+#define DEFAULT_INPUTFILE ""
+#define DEFAULT_k 200
+#define DEFAULT_OUTPUTFILE "defaultDict"
+#define DEFAULT_DICTID 0
+
+
+static unsigned readU32FromChar(const char** stringPtr)
+{   /* Parses an unsigned decimal at *stringPtr with an optional K/M suffix (x1024, x1024*1024; a following 'i' and/or 'B' is skipped) and advances *stringPtr past it. */
+    const char errorMsg[] = "error: numeric value too large";
+    unsigned result = 0;
+    while ((**stringPtr >='0') && (**stringPtr <='9')) {
+        unsigned const max = (((unsigned)(-1)) / 10) - 1;
+        if (result > max) { fprintf(stderr, "%s\n", errorMsg); exit(1); }   /* one more digit could overflow : report, then exit */
+        result *= 10, result += **stringPtr - '0', (*stringPtr)++ ;
+    }
+    if ((**stringPtr=='K') || (**stringPtr=='M')) {
+        unsigned const maxK = ((unsigned)(-1)) >> 10;
+        if (result > maxK) { fprintf(stderr, "%s\n", errorMsg); exit(1); }
+        result <<= 10;
+        if (**stringPtr=='M') {
+            if (result > maxK) { fprintf(stderr, "%s\n", errorMsg); exit(1); }
+            result <<= 10;
+        }
+        (*stringPtr)++;  /* skip `K` or `M` */
+        if (**stringPtr=='i') (*stringPtr)++;
+        if (**stringPtr=='B') (*stringPtr)++;
+    }
+    return result;
+}
+
+
+/** longCommandWArg() :
+ *  check if *stringPtr starts with longCommand (only the first strlen(longCommand) bytes are compared).
+ *  If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
+ * @return 0 and doesn't modify *stringPtr otherwise.
+ */
+static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
+{
+    size_t const comSize = strlen(longCommand);
+    int const result = !strncmp(*stringPtr, longCommand, comSize);   /* strncmp() == 0 : the prefix matches */
+    if (result) *stringPtr += comSize;
+    return result;
+}
+
+
+int main(int argCount, const char* argv[])
+{   /* Parses the command-line arguments, builds the list of input files, then trains a random dictionary and saves it to the output file. Returns 0 on success, non-zero otherwise. */
+  int displayLevel = 2;
+  const char* programName = argv[0];
+  int operationResult = 0;
+
+  unsigned cLevel = DEFAULT_CLEVEL;
+  char* inputFile = DEFAULT_INPUTFILE;
+  unsigned k = DEFAULT_k;
+  char* outputFile = DEFAULT_OUTPUTFILE;
+  unsigned dictID = DEFAULT_DICTID;
+  unsigned maxDictSize = g_defaultMaxDictSize;
+
+  const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*));   /* one slot per argument : upper bound on the number of in= entries */
+  unsigned filenameIdx = 0;
+  if (filenameTable == NULL) { DISPLAYLEVEL(1, "not enough memory\n"); return 1; }   /* the table is written to while parsing in= arguments */
+  for (int i = 1; i < argCount; i++) {
+    const char* argument = argv[i];
+    if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "c=")) { cLevel = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "in=")) {
+      inputFile = malloc(strlen(argument) + 1);
+      strcpy(inputFile, argument);
+      filenameTable[filenameIdx] = inputFile;
+      filenameIdx++;
+      continue;
+    }
+    if (longCommandWArg(&argument, "out=")) {
+      outputFile = malloc(strlen(argument) + 1);
+      strcpy(outputFile, argument);
+      continue;
+    }
+    DISPLAYLEVEL(1, "Incorrect parameters\n");
+    operationResult = 1;
+    return operationResult;
+  }
+
+
+  char* fileNamesBuf = NULL;
+  unsigned fileNamesNb = filenameIdx;
+  int followLinks = 0;
+  const char** extendedFileList = NULL;
+  extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, &fileNamesNb, followLinks);
+  if (extendedFileList) {
+      unsigned u;
+      for (u=0; u<fileNamesNb; u++) DISPLAYLEVEL(4, "%u %s\n", u, extendedFileList[u]);
+      free((void*)filenameTable);
+      filenameTable = extendedFileList;
+      filenameIdx = fileNamesNb;
+  }
+
+  size_t blockSize = 0;   /* passed as chunkSize : 0 means whole files, no chunking */
+
+  ZDICT_random_params_t params;
+  ZDICT_params_t zParams;
+  zParams.compressionLevel = cLevel;
+  zParams.notificationLevel = displayLevel;
+  zParams.dictID = dictID;
+  params.zParams = zParams;
+  params.k = k;
+
+  operationResult = RANDOM_trainFromFiles(outputFile, maxDictSize, filenameTable, filenameIdx, blockSize, &params);
+  return operationResult;
+}
diff --git a/contrib/randomDictBuilder/random.c b/contrib/randomDictBuilder/random.c
new file mode 100644
index 00000000..a59427ba
--- /dev/null
+++ b/contrib/randomDictBuilder/random.c
@@ -0,0 +1,455 @@
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdio.h>            /* fprintf */
+#include <stdlib.h>           /* malloc, free, qsort */
+#include <string.h>           /* memset */
+#include <time.h>             /* clock */
+#include "zstd_internal.h" /* includes zstd.h */
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "random.h"
+#include "platform.h"         /* Large Files support */
+#include "util.h"             /* UTIL_getFileSize, UTIL_getTotalFileSize */
+
+/*-*************************************
+*  Constants
+***************************************/
+#define SAMPLESIZE_MAX (128 KB)
+#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+#define RANDOM_MEMMULT 9
+static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
+
+#define NOISELENGTH 32
+#define DEFAULT_K 200
+
+/*-*************************************
+*  Console display
+***************************************/
+#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
+
+static const U64 g_refreshRate = SEC_TO_MICRO / 6;
+static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
+
+#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
+            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
+            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
+            if (displayLevel>=4) fflush(stderr); } } }
+
+
+/*-*************************************
+*  Exceptions
+***************************************/
+#ifndef DEBUG
+#  define DEBUG 0
+#endif
+#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
+#define EXM_THROW(error, ...)                                             \
+{                                                                         \
+    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
+    DISPLAY("Error %i : ", error);                                        \
+    DISPLAY(__VA_ARGS__);                                                 \
+    DISPLAY("\n");                                                        \
+    exit(error);                                                          \
+}
+
+
+/* ********************************************************
+*  File related operations
+**********************************************************/
+/** loadFiles() :
+ *  load samples from files listed in fileNamesTable into buffer.
+ *  works even if buffer is too small to load all samples.
+ *  Also provides the size of each sample into sampleSizes table
+ *  which must be sized correctly, using getFileStats().
+ * @return : nb of samples effectively loaded into `buffer`
+ * *bufferSizePtr is modified, it provides the amount of data loaded within buffer.
+ *  sampleSizes is filled with the size of each sample.
+ */
+static unsigned loadFiles(void* buffer, size_t* bufferSizePtr,
+                              size_t* sampleSizes, unsigned sstSize,
+                              const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize,
+                              unsigned displayLevel)
+{
+    char* const buff = (char*)buffer;
+    size_t pos = 0;
+    unsigned nbLoadedChunks = 0, fileIndex;
+
+    for (fileIndex=0; fileIndex<nbFiles; fileIndex++) {
+        const char* const fileName = fileNamesTable[fileIndex];
+        unsigned long long const fs64 = UTIL_getFileSize(fileName);
+        unsigned long long remainingToLoad = (fs64 == UTIL_FILESIZE_UNKNOWN) ? 0 : fs64;
+        U32 const nbChunks = targetChunkSize ? (U32)((fs64 + (targetChunkSize-1)) / targetChunkSize) : 1;
+        U64 const chunkSize = targetChunkSize ? MIN(targetChunkSize, fs64) : fs64;
+        size_t const maxChunkSize = (size_t)MIN(chunkSize, SAMPLESIZE_MAX);   /* at most SAMPLESIZE_MAX bytes are read per sample */
+        U32 cnb;
+        FILE* const f = fopen(fileName, "rb");
+        if (f==NULL) EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileName, strerror(errno));
+        DISPLAYUPDATE(2, "Loading %s...       \r", fileName);
+        for (cnb=0; cnb<nbChunks; cnb++) {
+            size_t const toLoad = (size_t)MIN(maxChunkSize, remainingToLoad);
+            if (toLoad > *bufferSizePtr-pos) break;   /* not enough room left in buffer : move on to the next file */
+            {   size_t const readSize = fread(buff+pos, 1, toLoad, f);
+                if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName);
+                pos += readSize;
+                sampleSizes[nbLoadedChunks++] = toLoad;
+                remainingToLoad -= targetChunkSize;
+                if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */
+                    fileIndex = nbFiles;  /* stop there */
+                    break;
+                }
+                if (toLoad < targetChunkSize) {
+                    fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR);   /* skip the part of this chunk that was not loaded */
+        }   }   }
+        fclose(f);
+    }
+    DISPLAYLEVEL(2, "\r%79s\r", "");
+    *bufferSizePtr = pos;
+    DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10))
+    return nbLoadedChunks;
+}
+
+
+
+#define rotl32(x,r) ((x << r) | (x >> (32 - r)))
+static U32 getRand(U32* src)
+{   /* Deterministic pseudo-random step : derives the next state from *src, stores it back, and returns it shifted right by 5 bits. */
+    static const U32 prime1 = 2654435761U;
+    static const U32 prime2 = 2246822519U;
+    U32 rand32 = *src;
+    rand32 *= prime1;
+    rand32 ^= prime2;
+    rand32  = rotl32(rand32, 13);
+    *src = rand32;   /* the caller's seed carries the state between calls */
+    return rand32 >> 5;
+}
+
+
+/* shuffle() :
+ * shuffle a table of file names in a semi-random way (fixed seed, so the order is reproducible; no-op when nbFiles < 2)
+ * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
+ * it will load random elements from it, instead of just the first ones. */
+static void shuffle(const char** fileNamesTable, unsigned nbFiles) {
+    U32 seed = 0xFD2FB528;
+    unsigned i;   /* nb of entries not yet placed; entry i-1 receives its final value at each step */
+    for (i = nbFiles; i > 1; --i) {   /* counting from nbFiles (not nbFiles-1) avoids wrapping to UINT_MAX when nbFiles == 0 */
+        unsigned const j = getRand(&seed) % i;
+        const char* const tmp = fileNamesTable[j];
+        fileNamesTable[j] = fileNamesTable[i - 1];
+        fileNamesTable[i - 1] = tmp;
+    }
+}
+
+
+
+/*-********************************************************
+*  Dictionary training functions
+**********************************************************/
+static size_t findMaxMem(unsigned long long requiredMem)
+{   /* Probes malloc() for an allocatable size close to requiredMem (capped at g_maxMemory) and returns that size minus one 8 MB step. */
+    size_t const step = 8 MB;
+    void* testmem = NULL;
+
+    requiredMem = (((requiredMem >> 23) + 1) << 23);   /* move up to the next 8 MB (1<<23) boundary */
+    requiredMem += step;
+    if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
+
+    while (!testmem) {   /* each failed attempt is retried with 8 MB less */
+        testmem = malloc((size_t)requiredMem);
+        requiredMem -= step;   /* also applied after the successful attempt */
+    }
+
+    free(testmem);
+    return (size_t)requiredMem;
+}
+
+static void saveDict(const char* dictFileName,
+                         const void* buff, size_t buffSize)
+{   /* Writes the buffSize bytes at buff into dictFileName; any failure ends the program through EXM_THROW (codes 3, 4, 5). */
+    FILE* const f = fopen(dictFileName, "wb");
+    if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
+
+    { size_t const n = fwrite(buff, 1, buffSize, f);
+      if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) }
+
+    { size_t const n = (size_t)fclose(f);   /* a non-zero fclose() result is reported as a flush error */
+      if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
+}
+
+/*! getFileStats() :
+ *  Given a list of files, and a chunkSize (0 == no chunk, whole files)
+ *  provides the amount of data to be loaded and the resulting nb of samples.
+ *  This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
+ */
+static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel)
+{
+    fileStats fs;
+    unsigned n;
+    memset(&fs, 0, sizeof(fs));
+    for (n=0; n<nbFiles; n++) {
+        U64 const fileSize = UTIL_getFileSize(fileNamesTable[n]);
+        U64 const srcSize = (fileSize == UTIL_FILESIZE_UNKNOWN) ? 0 : fileSize;   /* a file of unknown size counts as empty */
+        U32 const nbSamples = (U32)(chunkSize ? (srcSize + (chunkSize-1)) / chunkSize : 1);
+        U64 const chunkToLoad = chunkSize ? MIN(chunkSize, srcSize) : srcSize;
+        size_t const cappedChunkSize = (size_t)MIN(chunkToLoad, SAMPLESIZE_MAX);   /* at most SAMPLESIZE_MAX bytes are counted per sample */
+        fs.totalSizeToLoad += cappedChunkSize * nbSamples;
+        fs.oneSampleTooLarge |= (chunkSize > 2*SAMPLESIZE_MAX);
+        fs.nbSamples += nbSamples;
+    }
+    DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10));
+    return fs;
+}
+
+
+
+
+
+/* ********************************************************
+*  Random Dictionary Builder
+**********************************************************/
+/**
+ * Returns the sum of the first nbSamples entries of samplesSizes (0 when nbSamples is 0).
+ */
+static size_t RANDOM_sum(const size_t *samplesSizes, unsigned nbSamples) {
+  size_t sum = 0;
+  unsigned i;
+  for (i = 0; i < nbSamples; ++i) {
+    sum += samplesSizes[i];
+  }
+  return sum;
+}
+
+
+/**
+ * Selects a random k-byte segment from totalSamplesSize - k + 1 possible segments (requires k <= totalSamplesSize)
+ */
+static RANDOM_segment_t RANDOM_selectSegment(const RANDOM_ctx_t *ctx,
+                                            ZDICT_random_params_t parameters) {
+    static int isSeeded = 0;
+    const U32 k = parameters.k;
+    RANDOM_segment_t segment;
+    unsigned index;
+    /* Seed random number generator once only: re-seeding with time() on each call restarts the sequence and repeats the same pick within a second */
+    if (!isSeeded) { srand((unsigned)time(NULL)); isSeeded = 1; }
+    /* Randomly generate a number from 0 to totalSamplesSize - k */
+    index = rand()%(ctx->totalSamplesSize - k + 1);
+
+    /* inclusive */
+    segment.begin = index;
+    segment.end = index + k - 1;
+
+    return segment;
+}
+
+
+/**
+ * Check the validity of the parameters : k must be non-zero and no larger than maxDictSize.
+ * Returns non-zero if the parameters are valid and 0 otherwise.
+ */
+static int RANDOM_checkParameters(ZDICT_random_params_t parameters, size_t maxDictSize) {
+    /* k is a required parameter */
+    if (parameters.k == 0) {
+      return 0;
+    }
+    /* k <= maxDictSize */
+    if (parameters.k > maxDictSize) {
+      return 0;
+    }
+    return 1;
+}
+
+
+/**
+ * Clean up a context initialized with `RANDOM_ctx_init()`. Accepts NULL; only `offsets` is released here.
+ */
+static void RANDOM_ctx_destroy(RANDOM_ctx_t *ctx) {
+  if (!ctx) {
+    return;
+  }
+  if (ctx->offsets) {
+    free(ctx->offsets);
+    ctx->offsets = NULL;   /* a second destroy on the same context then frees nothing */
+  }
+}
+
+
+/**
+ * Prepare a context for dictionary building : records the sample pointers and computes the start offset of each sample.
+ * Returns 1 on success or zero on error.
+ * The context must be destroyed with `RANDOM_ctx_destroy()`.
+ */
+static int RANDOM_ctx_init(RANDOM_ctx_t *ctx, const void *samplesBuffer,
+                          const size_t *samplesSizes, unsigned nbSamples) {
+    const BYTE *const samples = (const BYTE *)samplesBuffer;
+    const size_t totalSamplesSize = RANDOM_sum(samplesSizes, nbSamples);
+    const int displayLevel = 2;   /* fixed verbosity, read by the DISPLAYLEVEL macro below */
+    /* Checks */
+    if (totalSamplesSize >= (size_t)RANDOM_MAX_SAMPLES_SIZE) {
+      DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
+                   (U32)(totalSamplesSize>>20), (RANDOM_MAX_SAMPLES_SIZE >> 20));
+      return 0;   /* ctx has not been written to at this point */
+    }
+    memset(ctx, 0, sizeof(*ctx));
+    DISPLAYLEVEL(1, "Building dictionary from %u samples of total size %u\n", nbSamples,
+                 (U32)totalSamplesSize);
+    ctx->samples = samples;
+    ctx->samplesSizes = samplesSizes;
+    ctx->nbSamples = nbSamples;
+    ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
+    ctx->totalSamplesSize = (U32)totalSamplesSize;
+    if (!ctx->offsets) {
+      DISPLAYLEVEL(1, "Failed to allocate buffer for offsets\n");
+      RANDOM_ctx_destroy(ctx);
+      return 0;
+    }
+    {
+      U32 i;
+      ctx->offsets[0] = 0;
+      for (i = 1; i <= nbSamples; ++i) {   /* offsets[i] = total size of samples 0..i-1 */
+        ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
+      }
+    }
+    return 1;
+}
+
+
+/**
+ * Given the prepared context build the dictionary : dictBuffer is filled from its end toward its start. Returns the remaining tail, which is 0 once the loop ends.
+ */
+static size_t RANDOM_buildDictionary(const RANDOM_ctx_t *ctx, void *dictBuffer,
+                                    size_t dictBufferCapacity,
+                                    ZDICT_random_params_t parameters) {
+    BYTE *const dict = (BYTE *)dictBuffer;
+    size_t tail = dictBufferCapacity;   /* nb of bytes at the front of dict still to be filled */
+    const int displayLevel = parameters.zParams.notificationLevel;
+    while (tail > 0) {
+
+      /* Select a segment */
+      RANDOM_segment_t segment = RANDOM_selectSegment(ctx, parameters);
+
+      size_t segmentSize;
+      segmentSize = MIN(segment.end - segment.begin + 1, tail);   /* the last segment is truncated to the room left */
+
+      tail -= segmentSize;
+      memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
+      DISPLAYUPDATE(
+          2, "\r%u%%       ",
+          (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
+    }
+
+    return tail;
+}
+
+/*! ZDICT_trainFromBuffer_random():
+ *  Train a dictionary from an array of samples using the RANDOM algorithm.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`. The samples must total at least `parameters.k` bytes.
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *          or an error code, which can be tested with ZDICT_isError().
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_random(
+    void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_random_params_t parameters) {
+      const int displayLevel = parameters.zParams.notificationLevel;
+      BYTE* const dict = (BYTE*)dictBuffer;
+      RANDOM_ctx_t ctx;
+      /* Checks */
+      if (!RANDOM_checkParameters(parameters, dictBufferCapacity)) {
+          DISPLAYLEVEL(1, "k is incorrect\n");
+          return ERROR(GENERIC);
+      }
+      if (nbSamples == 0) {
+        DISPLAYLEVEL(1, "Random must have at least one input file\n");
+        return ERROR(GENERIC);
+      }
+      if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+        DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
+                     ZDICT_DICTSIZE_MIN);
+        return ERROR(dstSize_tooSmall);
+      }
+      if (!RANDOM_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples)) return ERROR(GENERIC);
+      if (ctx.totalSamplesSize < parameters.k) {   /* no k-byte segment exists : totalSamplesSize - k + 1 would wrap around and segments would be read out of bounds */
+        DISPLAYLEVEL(1, "Total samples size is smaller than k\n"); RANDOM_ctx_destroy(&ctx); return ERROR(srcSize_wrong);
+      }
+      DISPLAYLEVEL(2, "Building dictionary\n");
+      {
+        const size_t tail = RANDOM_buildDictionary(&ctx, dictBuffer, dictBufferCapacity, parameters);
+        const size_t dictSize = ZDICT_finalizeDictionary(
+            dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+            samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
+        if (!ZSTD_isError(dictSize)) {
+            DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
+                          (U32)dictSize);
+        }
+        RANDOM_ctx_destroy(&ctx);
+        return dictSize;
+      }
+}
+
+
+int RANDOM_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
+                       const char** fileNamesTable, unsigned nbFiles,
+                       size_t chunkSize, ZDICT_random_params_t *params){   /* Loads samples from the files, trains a random dictionary of at most maxDictSize bytes, saves it to dictFileName. Returns 0 on success, 1 if training failed. */
+    unsigned const displayLevel = params->zParams.notificationLevel;
+    void* const dictBuffer = malloc(maxDictSize);
+    fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
+    size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
+    size_t const memMult = RANDOM_MEMMULT;
+    size_t const maxMem =  findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
+    size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
+    void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
+    int result = 0;
+
+    /* Checks */
+    if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer))
+        EXM_THROW(12, "not enough memory for RANDOM_trainFromFiles");   /* should not happen */
+    if (fs.oneSampleTooLarge) {
+        DISPLAYLEVEL(2, "!  Warning : some sample(s) are very large \n");
+        DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small samples. \n");
+        DISPLAYLEVEL(2, "!  As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
+    }
+    if (fs.nbSamples < 5) {
+        DISPLAYLEVEL(2, "!  Warning : nb of samples too low for proper processing ! \n");
+        DISPLAYLEVEL(2, "!  Please provide _one file per sample_. \n");
+        DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
+        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
+    }
+    if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) {
+        DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
+        DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
+    }
+
+    /* init */
+    if (loadedSize < fs.totalSizeToLoad)
+        DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20));
+
+    /* Load input buffer */
+    DISPLAYLEVEL(3, "Shuffling input files\n");
+    shuffle(fileNamesTable, nbFiles);
+    nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel);   /* nbFiles now holds the nb of samples actually loaded */
+
+    {   size_t dictSize;
+        dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, srcBuffer,
+                                             sampleSizes, nbFiles, *params);   /* only loaded samples have a valid entry in sampleSizes */
+        DISPLAYLEVEL(2, "k=%u\n", params->k);
+        if (ZDICT_isError(dictSize)) {
+            DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
+            result = 1;
+            goto _cleanup;
+        }
+        /* save dict */
+        DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
+        saveDict(dictFileName, dictBuffer, dictSize);
+    }
+
+    /* clean up */
+_cleanup:
+    free(srcBuffer);
+    free(sampleSizes);
+    free(dictBuffer);
+    return result;
+}
diff --git a/contrib/randomDictBuilder/random.h b/contrib/randomDictBuilder/random.h
new file mode 100644
index 00000000..05879641
--- /dev/null
+++ b/contrib/randomDictBuilder/random.h
@@ -0,0 +1,53 @@
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memset */
+#include <time.h>   /* clock */
+#include "zstd_internal.h" /* includes zstd.h */
+#ifndef ZDICT_STATIC_LINKING_ONLY
+#define ZDICT_STATIC_LINKING_ONLY
+#endif
+#include "zdict.h"
+
+
+/**************************************
+* Context
+***************************************/
+typedef struct {
+  const BYTE *samples;          /* flat buffer holding all samples back to back (not released by RANDOM_ctx_destroy) */
+  size_t *offsets;              /* nbSamples + 1 entries : offsets[i] is the start of sample i; allocated by RANDOM_ctx_init */
+  const size_t *samplesSizes;   /* size of each sample, in order */
+  size_t nbSamples;             /* nb of entries in samplesSizes */
+  U32 totalSamplesSize;         /* sum of all sample sizes */
+} RANDOM_ctx_t;
+
+/**
+ * A segment is an inclusive range in the source.
+ */
+typedef struct {
+  U32 begin;   /* position of the first byte of the segment within the samples buffer */
+  U32 end;     /* position of the last byte of the segment (inclusive) */
+} RANDOM_segment_t;
+
+
+typedef struct {
+    unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+]; Default to 200 */
+    ZDICT_params_t zParams;      /* forwarded to ZDICT_finalizeDictionary(); notificationLevel also sets the display level */
+} ZDICT_random_params_t;
+
+
+typedef struct {
+    U64 totalSizeToLoad;          /* total nb of bytes to load, each sample capped at SAMPLESIZE_MAX */
+    unsigned oneSampleTooLarge;   /* non-zero when chunkSize exceeds 2*SAMPLESIZE_MAX */
+    unsigned nbSamples;           /* total nb of samples over all files */
+} fileStats;
+
+
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_random(
+    void *dictBuffer, size_t dictBufferCapacity,
+    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_random_params_t parameters);
+
+
+int RANDOM_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
+                        const char** fileNamesTable, unsigned nbFiles,
+                        size_t chunkSize, ZDICT_random_params_t *params);
diff --git a/contrib/randomDictBuilder/test.sh b/contrib/randomDictBuilder/test.sh
new file mode 100644
index 00000000..552650ee
--- /dev/null
+++ b/contrib/randomDictBuilder/test.sh
@@ -0,0 +1,14 @@
+echo "Building random dictionary with c=5 in=../../lib/common k=200 out=dict1"
+./main c=5 in=../../lib/common k=200 out=dict1
+zstd -be3 -D dict1 -r ../../lib/common -q
+echo "Building random dictionary with c=9 in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000"
+./main c=9 in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000
+zstd -be3 -D dict2 -r ../../lib/common -q
+echo "Building random dictionary with 2 sample sources"
+./main in=../../lib/common in=../../lib/compress out=dict3
+zstd -be3 -D dict3 -r ../../lib/common -q
+echo "Removing dict1 dict2 dict3"
+rm -f dict1 dict2 dict3
+
+echo "Testing with invalid parameters, should fail"
+! ./main r=10

From 31731df4dab0df7b465de2de5641b2e3416c9086 Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Fri, 13 Jul 2018 17:38:53 -0700
Subject: [PATCH 02/13] Remove clevel and update documentation

---
 contrib/randomDictBuilder/README.md | 15 ++++++++++-----
 contrib/randomDictBuilder/main.c    | 11 ++++++++---
 contrib/randomDictBuilder/test.sh   |  8 ++++----
 3 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/contrib/randomDictBuilder/README.md b/contrib/randomDictBuilder/README.md
index cadffdf2..de2c7ff6 100644
--- a/contrib/randomDictBuilder/README.md
+++ b/contrib/randomDictBuilder/README.md
@@ -1,12 +1,17 @@
 Random Dictionary Builder
 
 ### Permitted Arguments:
-Input Files (in=fileName): files used to build dictionary, can include multiple files, each following "in=", required
+Input File/Directory (in=fileName): required; file/directory used to build dictionary; if directory, will operate recursively for files inside directory; can include multiple files/directories, each following "in="
 Output Dictionary (out=dictName): if not provided, default to defaultDict
-Dictionary ID (dictID=#): positive number, if not provided, default to 0
-Maximum Dictionary Size (maxdict=#): positive number, in bytes, if not provided, default to 110KB
-Size of Randomly Selected Segment (k=#): positive number, in bytes, if not provided, default to 200
-Compression Level (c=#): positive number, if not provided, default to 3
+Dictionary ID (dictID=#): nonnegative number; if not provided, default to 0
+Maximum Dictionary Size (maxdict=#): positive number; in bytes, if not provided, default to 110KB
+Size of Randomly Selected Segment (k=#): positive number; in bytes; if not provided, default to 200
+Compression Level (c=#): positive number; if not provided, default to 3
+
+
+###Usage:
+To build a random dictionary with the provided arguments: make run ARG= followed by arguments
+
 
 ### Examples:
 make run ARG="in=../../lib/dictBuilder out=dict100 dictID=520"
diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c
index 15eb5c44..cf0b9476 100644
--- a/contrib/randomDictBuilder/main.c
+++ b/contrib/randomDictBuilder/main.c
@@ -63,7 +63,7 @@ int main(int argCount, const char* argv[])
   const char* programName = argv[0];
   int operationResult = 0;
 
-  unsigned cLevel = DEFAULT_CLEVEL;
+  /* Initialize parameters with default value */
   char* inputFile = DEFAULT_INPUTFILE;
   unsigned k = DEFAULT_k;
   char* outputFile = DEFAULT_OUTPUTFILE;
@@ -76,10 +76,10 @@ int main(int argCount, const char* argv[])
   for (int i = 1; i < argCount; i++) {
     const char* argument = argv[i];
     if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; }
-    if (longCommandWArg(&argument, "c=")) { cLevel = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "in=")) {
+      /* Allow multiple input files */
       inputFile = malloc(strlen(argument) + 1);
       strcpy(inputFile, argument);
       filenameTable[filenameIdx] = inputFile;
@@ -96,6 +96,11 @@ int main(int argCount, const char* argv[])
     return operationResult;
   }
 
+  if (maxDictSize == 0) {
+    DISPLAYLEVEL(1, "maxDictSize should not be 0.\n");
+    operationResult = 1;
+    return operationResult;
+  }
 
   char* fileNamesBuf = NULL;
   unsigned fileNamesNb = filenameIdx;
@@ -114,7 +119,7 @@ int main(int argCount, const char* argv[])
 
   ZDICT_random_params_t params;
   ZDICT_params_t zParams;
-  zParams.compressionLevel = cLevel;
+  zParams.compressionLevel = DEFAULT_CLEVEL;
   zParams.notificationLevel = displayLevel;
   zParams.dictID = dictID;
   params.zParams = zParams;
diff --git a/contrib/randomDictBuilder/test.sh b/contrib/randomDictBuilder/test.sh
index 552650ee..497820f8 100644
--- a/contrib/randomDictBuilder/test.sh
+++ b/contrib/randomDictBuilder/test.sh
@@ -1,8 +1,8 @@
-echo "Building random dictionary with c=5 in=../../lib/common k=200 out=dict1"
-./main c=5 in=../../lib/common k=200 out=dict1
+echo "Building random dictionary with in=../../lib/common k=200 out=dict1"
+./main in=../../lib/common k=200 out=dict1
 zstd -be3 -D dict1 -r ../../lib/common -q
-echo "Building random dictionary with c=9 in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000"
-./main c=9 in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000
+echo "Building random dictionary with in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000"
+./main in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000
 zstd -be3 -D dict2 -r ../../lib/common -q
 echo "Building random dictionary with 2 sample sources"
 ./main in=../../lib/common in=../../lib/compress out=dict3

From 0e5fbc10facdce2def08e4f4ecb67d255694df3a Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Fri, 13 Jul 2018 17:41:09 -0700
Subject: [PATCH 03/13] Update README

---
 contrib/randomDictBuilder/README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/contrib/randomDictBuilder/README.md b/contrib/randomDictBuilder/README.md
index de2c7ff6..09f1e808 100644
--- a/contrib/randomDictBuilder/README.md
+++ b/contrib/randomDictBuilder/README.md
@@ -6,7 +6,6 @@ Output Dictionary (out=dictName): if not provided, default to defaultDict
 Dictionary ID (dictID=#): nonnegative number; if not provided, default to 0
 Maximum Dictionary Size (maxdict=#): positive number; in bytes, if not provided, default to 110KB
 Size of Randomly Selected Segment (k=#): positive number; in bytes; if not provided, default to 200
-Compression Level (c=#): positive number; if not provided, default to 3
 
 
 ###Usage:

From b5806d33db813dfb2bac7cd3b97b5bcf09ee57b7 Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Mon, 16 Jul 2018 16:03:04 -0700
Subject: [PATCH 04/13] Refactor RANDOM

---
 contrib/randomDictBuilder/Makefile |  12 +-
 contrib/randomDictBuilder/main.c   | 297 ++++++++++++++++++++++++-
 contrib/randomDictBuilder/random.c | 343 ++---------------------------
 contrib/randomDictBuilder/random.h |  23 --
 4 files changed, 314 insertions(+), 361 deletions(-)

diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile
index a2aade23..443f6f04 100644
--- a/contrib/randomDictBuilder/Makefile
+++ b/contrib/randomDictBuilder/Makefile
@@ -14,14 +14,14 @@ rand:
 	./main $(ARG)
 
 
-main: random.o main.o libzstd.a
-	gcc random.o main.o libzstd.a -o main
+main: main.o random.o libzstd.a
+	gcc main.o random.o libzstd.a -o main
 
-main.o: main.c
-	gcc -c main.c -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder -I random.h
+main.o: main.c $(PROGRAM_FILES)
+	gcc -c main.c $(PROGRAM_FILES) -I random.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
 
-random.o: $(PROGRAM_FILES) random.c
-	gcc -c $(PROGRAM_FILES) -I ../../programs -I ../../lib/common -I random.h random.c
+random.o: random.c
+	gcc -c random.c -I random.h -I ../../lib/common -I ../../lib/dictBuilder
 
 libzstd.a:
 	$(MAKE) -C ../../lib libzstd.a
diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c
index cf0b9476..d9295aa9 100644
--- a/contrib/randomDictBuilder/main.c
+++ b/contrib/randomDictBuilder/main.c
@@ -3,13 +3,45 @@
 #include <string.h>   /* strcmp, strlen */
 #include <errno.h>    /* errno */
 #include <ctype.h>
-#include "fileio.h"   /* stdinmark, stdoutmark, ZSTD_EXTENSION */
 #include "random.h"
+#include "fileio.h"   /* stdinmark, stdoutmark, ZSTD_EXTENSION */
+#include "platform.h"         /* Large Files support */
 #include "util.h"
+#include "zdict.h"
 
+/*-*************************************
+*  Console display
+***************************************/
 #define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
 #define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
 
+static const U64 g_refreshRate = SEC_TO_MICRO / 6;
+static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
+
+#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
+            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
+            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
+            if (displayLevel>=4) fflush(stderr); } } }
+
+/*-*************************************
+*  Exceptions
+***************************************/
+#ifndef DEBUG
+#  define DEBUG 0
+#endif
+#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
+#define EXM_THROW(error, ...)                                             \
+{                                                                         \
+    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
+    DISPLAY("Error %i : ", error);                                        \
+    DISPLAY(__VA_ARGS__);                                                 \
+    DISPLAY("\n");                                                        \
+    exit(error);                                                          \
+}
+
+/*-*************************************
+*  Constants
+***************************************/
 static const unsigned g_defaultMaxDictSize = 110 KB;
 #define DEFAULT_CLEVEL 3
 #define DEFAULT_INPUTFILE ""
@@ -17,7 +49,33 @@ static const unsigned g_defaultMaxDictSize = 110 KB;
 #define DEFAULT_OUTPUTFILE "defaultDict"
 #define DEFAULT_DICTID 0
 
+#define SAMPLESIZE_MAX (128 KB)
+#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+#define RANDOM_MEMMULT 9
+static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
 
+#define NOISELENGTH 32
+
+
+/*-*************************************
+*  Structs
+***************************************/
+typedef struct {
+    U64 totalSizeToLoad;          /* bytes getFileStats() expects to load, summed over all files (each chunk capped at SAMPLESIZE_MAX) */
+    unsigned oneSampleTooLarge;   /* non-zero when the requested chunkSize exceeds 2*SAMPLESIZE_MAX */
+    unsigned nbSamples;           /* number of samples (chunks) counted over all files */
+} fileStats;   /* result of getFileStats() */
+
+typedef struct {
+  const void* srcBuffer;         /* buffer into which loadFiles() copied the samples one after another */
+  const size_t *samplesSizes;    /* table holding the size of each sample */
+  size_t nbSamples;              /* number of samples */
+}sampleInfo;   /* built by getSampleInfo(), consumed by RANDOM_trainFromFiles() */
+
+
+/*-*************************************
+*  Commandline related functions
+***************************************/
 static unsigned readU32FromChar(const char** stringPtr)
 {
     const char errorMsg[] = "error: numeric value too large";
@@ -42,7 +100,6 @@ static unsigned readU32FromChar(const char** stringPtr)
     return result;
 }
 
-
 /** longCommandWArg() :
  *  check if *stringPtr is the same as longCommand.
  *  If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
@@ -56,6 +113,225 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
     return result;
 }
 
+/* ********************************************************
+*  File related operations
+**********************************************************/
+/** loadFiles() :
+ *  load samples from files listed in fileNamesTable into buffer.
+ *  works even if buffer is too small to load all samples.
+ *  Also provides the size of each sample into sampleSizes table (sstSize entries),
+ *  which must be sized correctly, using getFileStats().
+ * @return : nb of samples effectively loaded into `buffer`
+ * *bufferSizePtr is modified, it provides the amount of data loaded within buffer.
+ *  sampleSizes is filled with the size of each sample.
+ */
+static unsigned loadFiles(void* buffer, size_t* bufferSizePtr,
+                              size_t* sampleSizes, unsigned sstSize,
+                              const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize,
+                              unsigned displayLevel)
+{
+    char* const buff = (char*)buffer;
+    size_t pos = 0;   /* write position inside buff == nb of bytes loaded so far */
+    unsigned nbLoadedChunks = 0, fileIndex;
+
+    for (fileIndex=0; fileIndex<nbFiles; fileIndex++) {
+        const char* const fileName = fileNamesTable[fileIndex];
+        unsigned long long const fs64 = UTIL_getFileSize(fileName);
+        unsigned long long remainingToLoad = (fs64 == UTIL_FILESIZE_UNKNOWN) ? 0 : fs64;   /* unknown size : nothing to load */
+        U32 const nbChunks = targetChunkSize ? (U32)((fs64 + (targetChunkSize-1)) / targetChunkSize) : 1;   /* targetChunkSize==0 : whole file is one sample */
+        U64 const chunkSize = targetChunkSize ? MIN(targetChunkSize, fs64) : fs64;
+        size_t const maxChunkSize = (size_t)MIN(chunkSize, SAMPLESIZE_MAX);   /* a sample never exceeds SAMPLESIZE_MAX */
+        U32 cnb;
+        FILE* const f = fopen(fileName, "rb");
+        if (f==NULL) EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileName, strerror(errno));
+        DISPLAYUPDATE(2, "Loading %s...       \r", fileName);
+        for (cnb=0; cnb<nbChunks; cnb++) {
+            size_t const toLoad = (size_t)MIN(maxChunkSize, remainingToLoad);
+            if (toLoad > *bufferSizePtr-pos) break;   /* next chunk does not fit in the remaining buffer space : done with this file */
+            {   size_t const readSize = fread(buff+pos, 1, toLoad, f);
+                if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName);
+                pos += readSize;
+                sampleSizes[nbLoadedChunks++] = toLoad;
+                remainingToLoad -= targetChunkSize;   /* a full chunk is accounted for, even if only part of it was read */
+                if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */
+                    fileIndex = nbFiles;  /* stop there */
+                    break;
+                }
+                if (toLoad < targetChunkSize) {   /* skip the unread part of this chunk so the next read starts at the next chunk; fseek() result is not checked */
+                    fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR);
+        }   }   }
+        fclose(f);
+    }
+    DISPLAYLEVEL(2, "\r%79s\r", "");
+    *bufferSizePtr = pos;
+    DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10))
+    return nbLoadedChunks;
+}
+
+#define rotl32(x,r) ((x << r) | (x >> (32 - r)))   /* 32-bit left rotation; arguments are not parenthesized, pass simple expressions only */
+static U32 getRand(U32* src)   /* updates the generator state stored in *src and returns a value derived from it */
+{
+    static const U32 prime1 = 2654435761U;
+    static const U32 prime2 = 2246822519U;
+    U32 rand32 = *src;
+    rand32 *= prime1;
+    rand32 ^= prime2;
+    rand32  = rotl32(rand32, 13);
+    *src = rand32;         /* new state, used by the next call */
+    return rand32 >> 5;    /* the 5 lowest bits of the state are dropped from the returned value */
+}
+
+/* shuffle() :
+ * shuffle a table of file names in a semi-random way (the seed is fixed, so the resulting order is reproducible).
+ * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
+ * it will load random elements from it, instead of just the first ones. An empty table is left untouched. */
+static void shuffle(const char** fileNamesTable, unsigned nbFiles) {
+    U32 seed = 0xFD2FB528;
+    unsigned i;
+    for (i = (nbFiles ? nbFiles - 1 : 0); i > 0; --i) {   /* guard : `nbFiles - 1` would wrap around when nbFiles == 0 */
+        unsigned const j = getRand(&seed) % (i + 1);   /* j is in [0, i] */
+        const char* const tmp = fileNamesTable[j];
+        fileNamesTable[j] = fileNamesTable[i];
+        fileNamesTable[i] = tmp;
+    }
+}
+
+
+/*-********************************************************
+*  Dictionary training functions
+**********************************************************/
+static size_t findMaxMem(unsigned long long requiredMem)   /* probes malloc() to find an amount of memory that can actually be allocated */
+{
+    size_t const step = 8 MB;
+    void* testmem = NULL;
+
+    requiredMem = (((requiredMem >> 23) + 1) << 23);   /* round up to the next multiple of 2^23 bytes */
+    requiredMem += step;
+    if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
+
+    while (!testmem) {   /* NOTE(review): no lower bound; if every malloc() fails, requiredMem wraps below zero - confirm this is acceptable */
+        testmem = malloc((size_t)requiredMem);
+        requiredMem -= step;   /* executed after a successful malloc() too, so the returned value is one step below the size that succeeded */
+    }
+
+    free(testmem);   /* the allocation was only a probe */
+    return (size_t)requiredMem;
+}
+
+static void saveDict(const char* dictFileName,   /* writes buffSize bytes of buff into dictFileName; any failure ends the program through EXM_THROW() */
+                         const void* buff, size_t buffSize)
+{
+    FILE* const f = fopen(dictFileName, "wb");   /* "wb" : an existing file is truncated */
+    if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
+
+    { size_t const n = fwrite(buff, 1, buffSize, f);   /* element size is 1, so n is a byte count */
+      if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) }
+
+    { size_t const n = (size_t)fclose(f);   /* a non-zero fclose() result is reported as a flush error */
+      if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
+}
+
+/*! getFileStats() :
+ *  Given a list of files, and a chunkSize (0 == no chunk, whole files)
+ *  provides the amount of data to be loaded and the resulting nb of samples.
+ *  This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
+ */
+static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel)
+{
+    fileStats fs;
+    unsigned n;
+    memset(&fs, 0, sizeof(fs));
+    for (n=0; n<nbFiles; n++) {
+        U64 const fileSize = UTIL_getFileSize(fileNamesTable[n]);
+        U64 const srcSize = (fileSize == UTIL_FILESIZE_UNKNOWN) ? 0 : fileSize;   /* a file of unknown size counts as empty */
+        U32 const nbSamples = (U32)(chunkSize ? (srcSize + (chunkSize-1)) / chunkSize : 1);   /* nb of chunks, rounded up; 1 when files are not chunked */
+        U64 const chunkToLoad = chunkSize ? MIN(chunkSize, srcSize) : srcSize;
+        size_t const cappedChunkSize = (size_t)MIN(chunkToLoad, SAMPLESIZE_MAX);   /* at most SAMPLESIZE_MAX bytes are counted per sample */
+        fs.totalSizeToLoad += cappedChunkSize * nbSamples;
+        fs.oneSampleTooLarge |= (chunkSize > 2*SAMPLESIZE_MAX);   /* depends on chunkSize only, never set when chunkSize == 0 */
+        fs.nbSamples += nbSamples;
+    }
+    DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10));
+    return fs;
+}
+
+/* RANDOM_trainFromFiles() : trains a dictionary of at most maxDictSize bytes from the samples described by info
+ * and saves it into dictFileName. @return : 0 on success, 1 if training failed (nothing is saved in that case). */
+int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, unsigned maxDictSize,
+                       ZDICT_random_params_t *params){
+    unsigned const displayLevel = params->zParams.notificationLevel;
+    void* const dictBuffer = malloc(maxDictSize);
+    int result = 0;
+
+    /* Checks */
+    if (!dictBuffer)
+        EXM_THROW(12, "not enough memory for trainFromFiles");   /* should not happen */
+
+    {   size_t dictSize;
+        dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, info->srcBuffer,
+                                             info->samplesSizes, info->nbSamples, *params);
+        DISPLAYLEVEL(2, "k=%u\n", params->k);
+        if (ZDICT_isError(dictSize)) {
+            DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
+            result = 1;
+        } else {
+            /* save dict : only on success; on failure dictSize is an error code, not a length */
+            DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
+            saveDict(dictFileName, dictBuffer, dictSize);
+        }
+    }
+    /* clean up : dictBuffer is released exactly once, here, on both the success and the failure path */
+    free(dictBuffer);
+    return result;
+}
+
+/* getSampleInfo() : loads the samples of fileNamesTable (cut into chunkSize pieces when chunkSize != 0) into memory.
+ * @return : a malloc'ed sampleInfo; nothing here frees it, nor the srcBuffer and samplesSizes it points to.
+ * Exits through EXM_THROW() when an allocation fails or when fewer than 5 samples are available. */
+sampleInfo* getSampleInfo(const char** fileNamesTable,
+                  unsigned nbFiles, size_t chunkSize, unsigned maxDictSize, const unsigned displayLevel){
+    fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
+    size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
+    size_t const memMult = RANDOM_MEMMULT;
+    size_t const maxMem =  findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
+    size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
+    void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
+
+    /* Checks */
+    if ((!sampleSizes) || (!srcBuffer))
+        EXM_THROW(12, "not enough memory for trainFromFiles");   /* should not happen */
+    if (fs.oneSampleTooLarge) {
+        DISPLAYLEVEL(2, "!  Warning : some sample(s) are very large \n");
+        DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small samples. \n");
+        DISPLAYLEVEL(2, "!  As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
+    }
+    if (fs.nbSamples < 5) {
+        DISPLAYLEVEL(2, "!  Warning : nb of samples too low for proper processing ! \n");
+        DISPLAYLEVEL(2, "!  Please provide _one file per sample_. \n");
+        DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
+        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
+    }
+    if (fs.totalSizeToLoad < 8ULL * maxDictSize) {   /* multiply in 64 bits : `8 * maxDictSize` could wrap as unsigned */
+        DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
+        DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
+    }
+
+    /* init */
+    if (loadedSize < fs.totalSizeToLoad)
+        DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20));
+    /* Load input buffer */
+    DISPLAYLEVEL(3, "Shuffling input files\n");
+    shuffle(fileNamesTable, nbFiles);
+    unsigned const nbLoaded = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel);
+    sampleInfo *info = (sampleInfo *)malloc(sizeof(sampleInfo));
+    if (!info) EXM_THROW(12, "not enough memory for trainFromFiles");
+    info->nbSamples = nbLoaded;   /* loadFiles() may load fewer samples than getFileStats() counted; only loaded entries of sampleSizes are valid */
+    info->samplesSizes = sampleSizes;
+    info->srcBuffer = srcBuffer;
+    return info;
+}
+
+
 
 int main(int argCount, const char* argv[])
 {
@@ -63,7 +339,7 @@ int main(int argCount, const char* argv[])
   const char* programName = argv[0];
   int operationResult = 0;
 
-  /* Initialize parameters with default value */
+  unsigned cLevel = DEFAULT_CLEVEL;
   char* inputFile = DEFAULT_INPUTFILE;
   unsigned k = DEFAULT_k;
   char* outputFile = DEFAULT_OUTPUTFILE;
@@ -76,10 +352,10 @@ int main(int argCount, const char* argv[])
   for (int i = 1; i < argCount; i++) {
     const char* argument = argv[i];
     if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; }
+    if (longCommandWArg(&argument, "c=")) { cLevel = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "in=")) {
-      /* Allow multiple input files */
       inputFile = malloc(strlen(argument) + 1);
       strcpy(inputFile, argument);
       filenameTable[filenameIdx] = inputFile;
@@ -96,12 +372,6 @@ int main(int argCount, const char* argv[])
     return operationResult;
   }
 
-  if (maxDictSize == 0) {
-    DISPLAYLEVEL(1, "maxDictSize should not be 0.\n");
-    operationResult = 1;
-    return operationResult;
-  }
-
   char* fileNamesBuf = NULL;
   unsigned fileNamesNb = filenameIdx;
   int followLinks = 0;
@@ -119,12 +389,15 @@ int main(int argCount, const char* argv[])
 
   ZDICT_random_params_t params;
   ZDICT_params_t zParams;
-  zParams.compressionLevel = DEFAULT_CLEVEL;
+  zParams.compressionLevel = cLevel;
   zParams.notificationLevel = displayLevel;
   zParams.dictID = dictID;
   params.zParams = zParams;
   params.k = k;
 
-  operationResult = RANDOM_trainFromFiles(outputFile, maxDictSize, filenameTable, filenameIdx, blockSize, &params);
+  sampleInfo* info= getSampleInfo(filenameTable,
+                    filenameIdx, blockSize, maxDictSize, zParams.notificationLevel);
+  operationResult = RANDOM_trainFromFiles(outputFile, info, maxDictSize, &params);
+
   return operationResult;
 }
diff --git a/contrib/randomDictBuilder/random.c b/contrib/randomDictBuilder/random.c
index a59427ba..96c02389 100644
--- a/contrib/randomDictBuilder/random.c
+++ b/contrib/randomDictBuilder/random.c
@@ -5,24 +5,12 @@
 #include <stdlib.h>           /* malloc, free, qsort */
 #include <string.h>           /* memset */
 #include <time.h>             /* clock */
-#include "zstd_internal.h" /* includes zstd.h */
+#include "random.h"
+#include "util.h"             /* UTIL_getFileSize, UTIL_getTotalFileSize */
 #ifndef ZDICT_STATIC_LINKING_ONLY
 #define ZDICT_STATIC_LINKING_ONLY
 #endif
-#include "random.h"
-#include "platform.h"         /* Large Files support */
-#include "util.h"             /* UTIL_getFileSize, UTIL_getTotalFileSize */
-
-/*-*************************************
-*  Constants
-***************************************/
-#define SAMPLESIZE_MAX (128 KB)
-#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
-#define RANDOM_MEMMULT 9
-static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
-
-#define NOISELENGTH 32
-#define DEFAULT_K 200
+#include "zdict.h"
 
 /*-*************************************
 *  Console display
@@ -30,179 +18,16 @@ static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((siz
 #define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
 #define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
 
-static const U64 g_refreshRate = SEC_TO_MICRO / 6;
-static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
-
-#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
-            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
-            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
-            if (displayLevel>=4) fflush(stderr); } } }
-
-
-/*-*************************************
-*  Exceptions
-***************************************/
-#ifndef DEBUG
-#  define DEBUG 0
-#endif
-#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
-#define EXM_THROW(error, ...)                                             \
-{                                                                         \
-    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
-    DISPLAY("Error %i : ", error);                                        \
-    DISPLAY(__VA_ARGS__);                                                 \
-    DISPLAY("\n");                                                        \
-    exit(error);                                                          \
-}
-
-
-/* ********************************************************
-*  File related operations
-**********************************************************/
-/** loadFiles() :
- *  load samples from files listed in fileNamesTable into buffer.
- *  works even if buffer is too small to load all samples.
- *  Also provides the size of each sample into sampleSizes table
- *  which must be sized correctly, using DiB_fileStats().
- * @return : nb of samples effectively loaded into `buffer`
- * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
- *  sampleSizes is filled with the size of each sample.
- */
-static unsigned loadFiles(void* buffer, size_t* bufferSizePtr,
-                              size_t* sampleSizes, unsigned sstSize,
-                              const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize,
-                              unsigned displayLevel)
-{
-    char* const buff = (char*)buffer;
-    size_t pos = 0;
-    unsigned nbLoadedChunks = 0, fileIndex;
-
-    for (fileIndex=0; fileIndex<nbFiles; fileIndex++) {
-        const char* const fileName = fileNamesTable[fileIndex];
-        unsigned long long const fs64 = UTIL_getFileSize(fileName);
-        unsigned long long remainingToLoad = (fs64 == UTIL_FILESIZE_UNKNOWN) ? 0 : fs64;
-        U32 const nbChunks = targetChunkSize ? (U32)((fs64 + (targetChunkSize-1)) / targetChunkSize) : 1;
-        U64 const chunkSize = targetChunkSize ? MIN(targetChunkSize, fs64) : fs64;
-        size_t const maxChunkSize = (size_t)MIN(chunkSize, SAMPLESIZE_MAX);
-        U32 cnb;
-        FILE* const f = fopen(fileName, "rb");
-        if (f==NULL) EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileName, strerror(errno));
-        DISPLAYUPDATE(2, "Loading %s...       \r", fileName);
-        for (cnb=0; cnb<nbChunks; cnb++) {
-            size_t const toLoad = (size_t)MIN(maxChunkSize, remainingToLoad);
-            if (toLoad > *bufferSizePtr-pos) break;
-            {   size_t const readSize = fread(buff+pos, 1, toLoad, f);
-                if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName);
-                pos += readSize;
-                sampleSizes[nbLoadedChunks++] = toLoad;
-                remainingToLoad -= targetChunkSize;
-                if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */
-                    fileIndex = nbFiles;  /* stop there */
-                    break;
-                }
-                if (toLoad < targetChunkSize) {
-                    fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR);
-        }   }   }
-        fclose(f);
-    }
-    DISPLAYLEVEL(2, "\r%79s\r", "");
-    *bufferSizePtr = pos;
-    DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10))
-    return nbLoadedChunks;
-}
-
-
-
-#define rotl32(x,r) ((x << r) | (x >> (32 - r)))
-static U32 getRand(U32* src)
-{
-    static const U32 prime1 = 2654435761U;
-    static const U32 prime2 = 2246822519U;
-    U32 rand32 = *src;
-    rand32 *= prime1;
-    rand32 ^= prime2;
-    rand32  = rotl32(rand32, 13);
-    *src = rand32;
-    return rand32 >> 5;
-}
-
-
-/* shuffle() :
- * shuffle a table of file names in a semi-random way
- * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
- * it will load random elements from it, instead of just the first ones. */
-static void shuffle(const char** fileNamesTable, unsigned nbFiles) {
-    U32 seed = 0xFD2FB528;
-    unsigned i;
-    for (i = nbFiles - 1; i > 0; --i) {
-        unsigned const j = getRand(&seed) % (i + 1);
-        const char* const tmp = fileNamesTable[j];
-        fileNamesTable[j] = fileNamesTable[i];
-        fileNamesTable[i] = tmp;
-    }
-}
-
-
-
-/*-********************************************************
-*  Dictionary training functions
-**********************************************************/
-static size_t findMaxMem(unsigned long long requiredMem)
-{
-    size_t const step = 8 MB;
-    void* testmem = NULL;
-
-    requiredMem = (((requiredMem >> 23) + 1) << 23);
-    requiredMem += step;
-    if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
-
-    while (!testmem) {
-        testmem = malloc((size_t)requiredMem);
-        requiredMem -= step;
-    }
-
-    free(testmem);
-    return (size_t)requiredMem;
-}
-
-static void saveDict(const char* dictFileName,
-                         const void* buff, size_t buffSize)
-{
-    FILE* const f = fopen(dictFileName, "wb");
-    if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
-
-    { size_t const n = fwrite(buff, 1, buffSize, f);
-      if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) }
-
-    { size_t const n = (size_t)fclose(f);
-      if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
-}
-
-/*! getFileStats() :
- *  Given a list of files, and a chunkSize (0 == no chunk, whole files)
- *  provides the amount of data to be loaded and the resulting nb of samples.
- *  This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
- */
-static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel)
-{
-    fileStats fs;
-    unsigned n;
-    memset(&fs, 0, sizeof(fs));
-    for (n=0; n<nbFiles; n++) {
-        U64 const fileSize = UTIL_getFileSize(fileNamesTable[n]);
-        U64 const srcSize = (fileSize == UTIL_FILESIZE_UNKNOWN) ? 0 : fileSize;
-        U32 const nbSamples = (U32)(chunkSize ? (srcSize + (chunkSize-1)) / chunkSize : 1);
-        U64 const chunkToLoad = chunkSize ? MIN(chunkSize, srcSize) : srcSize;
-        size_t const cappedChunkSize = (size_t)MIN(chunkToLoad, SAMPLESIZE_MAX);
-        fs.totalSizeToLoad += cappedChunkSize * nbSamples;
-        fs.oneSampleTooLarge |= (chunkSize > 2*SAMPLESIZE_MAX);
-        fs.nbSamples += nbSamples;
-    }
-    DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10));
-    return fs;
-}
-
-
+#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
+  if (displayLevel >= l) {                                                     \
+    if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) {             \
+      g_time = clock();                                                        \
+      DISPLAY(__VA_ARGS__);                                                    \
+    }                                                                          \
+  }
+#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(displayLevel, l, __VA_ARGS__)   /* uses the `displayLevel` variable visible at the call site */
+static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100;   /* minimum clock() delta between two updates (0.15 s of processor time), ignored when displayLevel >= 4 */
+static clock_t g_time = 0;   /* clock() value recorded at the last update printed by LOCALDISPLAYUPDATE */
 
 
 
@@ -225,16 +50,14 @@ static size_t RANDOM_sum(const size_t *samplesSizes, unsigned nbSamples) {
 /**
  * Selects a random segment from totalSamplesSize - k + 1 possible segments
  */
-static RANDOM_segment_t RANDOM_selectSegment(const RANDOM_ctx_t *ctx,
+static RANDOM_segment_t RANDOM_selectSegment(const size_t totalSamplesSize,
                                             ZDICT_random_params_t parameters) {
     const U32 k = parameters.k;
     RANDOM_segment_t segment;
     unsigned index;
 
-    /* Seed random number generator */
-    srand((unsigned)time(NULL));
     /* Randomly generate a number from 0 to sampleSizes - k */
-    index = rand()%(ctx->totalSamplesSize - k + 1);
+    index = rand()%(totalSamplesSize - k + 1);
 
     /* inclusive */
     segment.begin = index;
@@ -261,65 +84,11 @@ static int RANDOM_checkParameters(ZDICT_random_params_t parameters, size_t maxDi
 }
 
 
-/**
- * Clean up a context initialized with `RANDOM_ctx_init()`.
- */
-static void RANDOM_ctx_destroy(RANDOM_ctx_t *ctx) {
-  if (!ctx) {
-    return;
-  }
-  if (ctx->offsets) {
-    free(ctx->offsets);
-    ctx->offsets = NULL;
-  }
-}
-
-
-/**
- * Prepare a context for dictionary building.
- * Returns 1 on success or zero on error.
- * The context must be destroyed with `RANDOM_ctx_destroy()`.
- */
-static int RANDOM_ctx_init(RANDOM_ctx_t *ctx, const void *samplesBuffer,
-                          const size_t *samplesSizes, unsigned nbSamples) {
-    const BYTE *const samples = (const BYTE *)samplesBuffer;
-    const size_t totalSamplesSize = RANDOM_sum(samplesSizes, nbSamples);
-    const int displayLevel = 2;
-    /* Checks */
-    if (totalSamplesSize >= (size_t)RANDOM_MAX_SAMPLES_SIZE) {
-      DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
-                   (U32)(totalSamplesSize>>20), (RANDOM_MAX_SAMPLES_SIZE >> 20));
-      return 0;
-    }
-    memset(ctx, 0, sizeof(*ctx));
-    DISPLAYLEVEL(1, "Building dictionary from %u samples of total size %u\n", nbSamples,
-                 (U32)totalSamplesSize);
-    ctx->samples = samples;
-    ctx->samplesSizes = samplesSizes;
-    ctx->nbSamples = nbSamples;
-    ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
-    ctx->totalSamplesSize = (U32)totalSamplesSize;
-    if (!ctx->offsets) {
-      DISPLAYLEVEL(1, "Failed to allocate buffer for offsets\n");
-      RANDOM_ctx_destroy(ctx);
-      return 0;
-    }
-    {
-      U32 i;
-      ctx->offsets[0] = 0;
-      for (i = 1; i <= nbSamples; ++i) {
-        ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
-      }
-    }
-    return 1;
-}
-
-
 /**
  * Given the prepared context build the dictionary.
  */
-static size_t RANDOM_buildDictionary(const RANDOM_ctx_t *ctx, void *dictBuffer,
-                                    size_t dictBufferCapacity,
+static size_t RANDOM_buildDictionary(const size_t totalSamplesSize, const BYTE *samples,
+                                    void *dictBuffer, size_t dictBufferCapacity,
                                     ZDICT_random_params_t parameters) {
     BYTE *const dict = (BYTE *)dictBuffer;
     size_t tail = dictBufferCapacity;
@@ -327,13 +96,13 @@ static size_t RANDOM_buildDictionary(const RANDOM_ctx_t *ctx, void *dictBuffer,
     while (tail > 0) {
 
       /* Select a segment */
-      RANDOM_segment_t segment = RANDOM_selectSegment(ctx, parameters);
+      RANDOM_segment_t segment = RANDOM_selectSegment(totalSamplesSize, parameters);
 
       size_t segmentSize;
       segmentSize = MIN(segment.end - segment.begin + 1, tail);
 
       tail -= segmentSize;
-      memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
+      memcpy(dict + tail, samples + segment.begin, segmentSize);
       DISPLAYUPDATE(
           2, "\r%u%%       ",
           (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
@@ -342,6 +111,7 @@ static size_t RANDOM_buildDictionary(const RANDOM_ctx_t *ctx, void *dictBuffer,
     return tail;
 }
 
+
 /*! ZDICT_trainFromBuffer_random():
  *  Train a dictionary from an array of samples using the RANDOM algorithm.
  *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
@@ -356,7 +126,6 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_random(
     ZDICT_random_params_t parameters) {
       const int displayLevel = parameters.zParams.notificationLevel;
       BYTE* const dict = (BYTE*)dictBuffer;
-      RANDOM_ctx_t ctx;
       /* Checks */
       if (!RANDOM_checkParameters(parameters, dictBufferCapacity)) {
           DISPLAYLEVEL(1, "k is incorrect\n");
@@ -371,13 +140,12 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_random(
                      ZDICT_DICTSIZE_MIN);
         return ERROR(dstSize_tooSmall);
       }
+      const size_t totalSamplesSize = RANDOM_sum(samplesSizes, nbSamples);
+      const BYTE *const samples = (const BYTE *)samplesBuffer;
 
-      if (!RANDOM_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples)) {
-        return ERROR(GENERIC);
-      }
       DISPLAYLEVEL(2, "Building dictionary\n");
       {
-        const size_t tail = RANDOM_buildDictionary(&ctx, dictBuffer, dictBufferCapacity, parameters);
+        const size_t tail = RANDOM_buildDictionary(totalSamplesSize, samples, dictBuffer, dictBufferCapacity, parameters);
         const size_t dictSize = ZDICT_finalizeDictionary(
             dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
             samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
@@ -385,71 +153,6 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_random(
             DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
                           (U32)dictSize);
         }
-        RANDOM_ctx_destroy(&ctx);
         return dictSize;
       }
 }
-
-
-int RANDOM_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
-                       const char** fileNamesTable, unsigned nbFiles,
-                       size_t chunkSize, ZDICT_random_params_t *params){
-    unsigned const displayLevel = params->zParams.notificationLevel;
-    void* const dictBuffer = malloc(maxDictSize);
-    fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
-    size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
-    size_t const memMult = RANDOM_MEMMULT;
-    size_t const maxMem =  findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
-    size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
-    void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
-    int result = 0;
-
-    /* Checks */
-    if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer))
-        EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
-    if (fs.oneSampleTooLarge) {
-        DISPLAYLEVEL(2, "!  Warning : some sample(s) are very large \n");
-        DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small samples. \n");
-        DISPLAYLEVEL(2, "!  As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
-    }
-    if (fs.nbSamples < 5) {
-        DISPLAYLEVEL(2, "!  Warning : nb of samples too low for proper processing ! \n");
-        DISPLAYLEVEL(2, "!  Please provide _one file per sample_. \n");
-        DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
-        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
-    }
-    if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) {
-        DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
-        DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
-    }
-
-    /* init */
-    if (loadedSize < fs.totalSizeToLoad)
-        DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20));
-
-    /* Load input buffer */
-    DISPLAYLEVEL(3, "Shuffling input files\n");
-    shuffle(fileNamesTable, nbFiles);
-    nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel);
-
-    {   size_t dictSize;
-        dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, srcBuffer,
-                                             sampleSizes, fs.nbSamples, *params);
-        DISPLAYLEVEL(2, "k=%u\n", params->k);
-        if (ZDICT_isError(dictSize)) {
-            DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
-            result = 1;
-            goto _cleanup;
-        }
-        /* save dict */
-        DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
-        saveDict(dictFileName, dictBuffer, dictSize);
-    }
-
-    /* clean up */
-_cleanup:
-    free(srcBuffer);
-    free(sampleSizes);
-    free(dictBuffer);
-    return result;
-}
diff --git a/contrib/randomDictBuilder/random.h b/contrib/randomDictBuilder/random.h
index 05879641..77529daf 100644
--- a/contrib/randomDictBuilder/random.h
+++ b/contrib/randomDictBuilder/random.h
@@ -8,18 +8,6 @@
 #endif
 #include "zdict.h"
 
-
-/**************************************
-* Context
-***************************************/
-typedef struct {
-  const BYTE *samples;
-  size_t *offsets;
-  const size_t *samplesSizes;
-  size_t nbSamples;
-  U32 totalSamplesSize;
-} RANDOM_ctx_t;
-
 /**
  * A segment is an inclusive range in the source.
  */
@@ -35,19 +23,8 @@ typedef struct {
 } ZDICT_random_params_t;
 
 
-typedef struct {
-    U64 totalSizeToLoad;
-    unsigned oneSampleTooLarge;
-    unsigned nbSamples;
-} fileStats;
-
 
 ZDICTLIB_API size_t ZDICT_trainFromBuffer_random(
     void *dictBuffer, size_t dictBufferCapacity,
     const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
     ZDICT_random_params_t parameters);
-
-
-int RANDOM_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
-                        const char** fileNamesTable, unsigned nbFiles,
-                        size_t chunkSize, ZDICT_random_params_t *params);

From 1f7fa5cdd6555e22dfa8c2dc1f5c17293e703fe3 Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Mon, 16 Jul 2018 16:31:59 -0700
Subject: [PATCH 05/13] Fix spacing and Edit Makefile (now run with make
 instead of make run)

---
 contrib/randomDictBuilder/Makefile  | 13 +++++----
 contrib/randomDictBuilder/README.md |  9 ++++---
 contrib/randomDictBuilder/main.c    | 42 ++++++++++++++---------------
 contrib/randomDictBuilder/random.c  |  9 ++++---
 contrib/randomDictBuilder/random.h  |  5 ++--
 5 files changed, 40 insertions(+), 38 deletions(-)

diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile
index 443f6f04..77dd2933 100644
--- a/contrib/randomDictBuilder/Makefile
+++ b/contrib/randomDictBuilder/Makefile
@@ -4,16 +4,15 @@ TEST_INPUT := ../../lib
 TEST_OUTPUT := randomDict
 ARG :=
 
-all: main testrun test clean
+all: main run clean
 
-run: main rand clean
+test: main testrun testshell clean
 
-.PHONY: rand
-rand:
+.PHONY: run
+run:
 	echo "Building a random dictionary with given arguments"
 	./main $(ARG)
 
-
 main: main.o random.o libzstd.a
 	gcc main.o random.o libzstd.a -o main
 
@@ -34,8 +33,8 @@ testrun: main
 	zstd -be3 -D $(TEST_OUTPUT) -r $(TEST_INPUT) -q
 	rm -f $(TEST_OUTPUT)
 
-.PHONY: test
-test: test.sh
+.PHONY: testshell
+testshell: test.sh
 	sh test.sh
 	echo "Finish running test.sh"
 
diff --git a/contrib/randomDictBuilder/README.md b/contrib/randomDictBuilder/README.md
index 09f1e808..0e70d3dc 100644
--- a/contrib/randomDictBuilder/README.md
+++ b/contrib/randomDictBuilder/README.md
@@ -7,11 +7,14 @@ Dictionary ID (dictID=#): nonnegative number; if not provided, default to 0
 Maximum Dictionary Size (maxdict=#): positive number; in bytes, if not provided, default to 110KB
 Size of Randomly Selected Segment (k=#): positive number; in bytes; if not provided, default to 200
 
+### Running Test:
+make test
+
 
 ###Usage:
-To build a random dictionary with the provided arguments: make run ARG= followed by arguments
+To build a random dictionary with the provided arguments: make ARG= followed by arguments
 
 
 ### Examples:
-make run ARG="in=../../lib/dictBuilder out=dict100 dictID=520"
-make run ARG="in=../../lib/dictBuilder in=../../lib/compress"
+make ARG="in=../../lib/dictBuilder out=dict100 dictID=520"
+make ARG="in=../../lib/dictBuilder in=../../lib/compress"
diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c
index d9295aa9..e195188b 100644
--- a/contrib/randomDictBuilder/main.c
+++ b/contrib/randomDictBuilder/main.c
@@ -52,7 +52,8 @@ static const unsigned g_defaultMaxDictSize = 110 KB;
 #define SAMPLESIZE_MAX (128 KB)
 #define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
 #define RANDOM_MEMMULT 9
-static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
+static const size_t g_maxMemory = (sizeof(size_t) == 4) ?
+                          (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
 
 #define NOISELENGTH 32
 
@@ -76,8 +77,7 @@ typedef struct {
 /*-*************************************
 *  Commandline related functions
 ***************************************/
-static unsigned readU32FromChar(const char** stringPtr)
-{
+static unsigned readU32FromChar(const char** stringPtr){
     const char errorMsg[] = "error: numeric value too large";
     unsigned result = 0;
     while ((**stringPtr >='0') && (**stringPtr <='9')) {
@@ -105,8 +105,7 @@ static unsigned readU32FromChar(const char** stringPtr)
  *  If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
  * @return 0 and doesn't modify *stringPtr otherwise.
  */
-static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
-{
+static unsigned longCommandWArg(const char** stringPtr, const char* longCommand){
     size_t const comSize = strlen(longCommand);
     int const result = !strncmp(*stringPtr, longCommand, comSize);
     if (result) *stringPtr += comSize;
@@ -125,11 +124,9 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
  * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
  *  sampleSizes is filled with the size of each sample.
  */
-static unsigned loadFiles(void* buffer, size_t* bufferSizePtr,
-                              size_t* sampleSizes, unsigned sstSize,
-                              const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize,
-                              unsigned displayLevel)
-{
+static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, size_t* sampleSizes,
+                          unsigned sstSize, const char** fileNamesTable, unsigned nbFiles,
+                          size_t targetChunkSize, unsigned displayLevel) {
     char* const buff = (char*)buffer;
     size_t pos = 0;
     unsigned nbLoadedChunks = 0, fileIndex;
@@ -200,8 +197,7 @@ static void shuffle(const char** fileNamesTable, unsigned nbFiles) {
 /*-********************************************************
 *  Dictionary training functions
 **********************************************************/
-static size_t findMaxMem(unsigned long long requiredMem)
-{
+static size_t findMaxMem(unsigned long long requiredMem) {
     size_t const step = 8 MB;
     void* testmem = NULL;
 
@@ -219,8 +215,7 @@ static size_t findMaxMem(unsigned long long requiredMem)
 }
 
 static void saveDict(const char* dictFileName,
-                         const void* buff, size_t buffSize)
-{
+                         const void* buff, size_t buffSize) {
     FILE* const f = fopen(dictFileName, "wb");
     if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
 
@@ -236,8 +231,8 @@ static void saveDict(const char* dictFileName,
  *  provides the amount of data to be loaded and the resulting nb of samples.
  *  This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
  */
-static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel)
-{
+static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles,
+                              size_t chunkSize, unsigned displayLevel) {
     fileStats fs;
     unsigned n;
     memset(&fs, 0, sizeof(fs));
@@ -255,8 +250,9 @@ static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, siz
     return fs;
 }
 
-int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, unsigned maxDictSize,
-                       ZDICT_random_params_t *params){
+int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info,
+                          unsigned maxDictSize,
+                          ZDICT_random_params_t *params) {
     unsigned const displayLevel = params->zParams.notificationLevel;
     void* const dictBuffer = malloc(maxDictSize);
 
@@ -285,8 +281,8 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, unsigned m
     return result;
 }
 
-sampleInfo* getSampleInfo(const char** fileNamesTable,
-                  unsigned nbFiles, size_t chunkSize, unsigned maxDictSize, const unsigned displayLevel){
+sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
+                          unsigned maxDictSize, const unsigned displayLevel) {
     fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
     size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
     size_t const memMult = RANDOM_MEMMULT;
@@ -320,7 +316,8 @@ sampleInfo* getSampleInfo(const char** fileNamesTable,
     /* Load input buffer */
     DISPLAYLEVEL(3, "Shuffling input files\n");
     shuffle(fileNamesTable, nbFiles);
-    nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel);
+    nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples,
+                        fileNamesTable, nbFiles, chunkSize, displayLevel);
 
     sampleInfo *info = (sampleInfo *)malloc(sizeof(sampleInfo));
 
@@ -376,7 +373,8 @@ int main(int argCount, const char* argv[])
   unsigned fileNamesNb = filenameIdx;
   int followLinks = 0;
   const char** extendedFileList = NULL;
-  extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, &fileNamesNb, followLinks);
+  extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf,
+                                        &fileNamesNb, followLinks);
   if (extendedFileList) {
       unsigned u;
       for (u=0; u<fileNamesNb; u++) DISPLAYLEVEL(4, "%u %s\n", u, extendedFileList[u]);
diff --git a/contrib/randomDictBuilder/random.c b/contrib/randomDictBuilder/random.c
index 96c02389..cfed14a4 100644
--- a/contrib/randomDictBuilder/random.c
+++ b/contrib/randomDictBuilder/random.c
@@ -71,7 +71,8 @@ static RANDOM_segment_t RANDOM_selectSegment(const size_t totalSamplesSize,
  * Check the validity of the parameters.
  * Returns non-zero if the parameters are valid and 0 otherwise.
  */
-static int RANDOM_checkParameters(ZDICT_random_params_t parameters, size_t maxDictSize) {
+static int RANDOM_checkParameters(ZDICT_random_params_t parameters,
+                                  size_t maxDictSize) {
     /* k is a required parameter */
     if (parameters.k == 0) {
       return 0;
@@ -115,7 +116,8 @@ static size_t RANDOM_buildDictionary(const size_t totalSamplesSize, const BYTE *
 /*! ZDICT_trainFromBuffer_random():
  *  Train a dictionary from an array of samples using the RANDOM algorithm.
  *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
- *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each
+ *  sample, in order.
  *  The resulting dictionary will be saved into `dictBuffer`.
  * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
  *          or an error code, which can be tested with ZDICT_isError().
@@ -145,7 +147,8 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_random(
 
       DISPLAYLEVEL(2, "Building dictionary\n");
       {
-        const size_t tail = RANDOM_buildDictionary(totalSamplesSize, samples, dictBuffer, dictBufferCapacity, parameters);
+        const size_t tail = RANDOM_buildDictionary(totalSamplesSize, samples,
+                                  dictBuffer, dictBufferCapacity, parameters);
         const size_t dictSize = ZDICT_finalizeDictionary(
             dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
             samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
diff --git a/contrib/randomDictBuilder/random.h b/contrib/randomDictBuilder/random.h
index 77529daf..b6696323 100644
--- a/contrib/randomDictBuilder/random.h
+++ b/contrib/randomDictBuilder/random.h
@@ -18,13 +18,12 @@ typedef struct {
 
 
 typedef struct {
-    unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+]; Default to 200 */
+    unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+]; Default to 200 */
     ZDICT_params_t zParams;
 } ZDICT_random_params_t;
 
 
 
-ZDICTLIB_API size_t ZDICT_trainFromBuffer_random(
-    void *dictBuffer, size_t dictBufferCapacity,
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( void *dictBuffer, size_t dictBufferCapacity,
     const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
     ZDICT_random_params_t parameters);

From 4d32339b75c98c4534963eb73a55ae7c4826214e Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Mon, 16 Jul 2018 18:59:18 -0700
Subject: [PATCH 06/13] Remove CLevel cli option which was accidentally added
 back in the last commit

---
 contrib/randomDictBuilder/main.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c
index e195188b..e66f2847 100644
--- a/contrib/randomDictBuilder/main.c
+++ b/contrib/randomDictBuilder/main.c
@@ -336,7 +336,6 @@ int main(int argCount, const char* argv[])
   const char* programName = argv[0];
   int operationResult = 0;
 
-  unsigned cLevel = DEFAULT_CLEVEL;
   char* inputFile = DEFAULT_INPUTFILE;
   unsigned k = DEFAULT_k;
   char* outputFile = DEFAULT_OUTPUTFILE;
@@ -349,7 +348,6 @@ int main(int argCount, const char* argv[])
   for (int i = 1; i < argCount; i++) {
     const char* argument = argv[i];
     if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; }
-    if (longCommandWArg(&argument, "c=")) { cLevel = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "in=")) {
@@ -387,7 +385,7 @@ int main(int argCount, const char* argv[])
 
   ZDICT_random_params_t params;
   ZDICT_params_t zParams;
-  zParams.compressionLevel = cLevel;
+  zParams.compressionLevel = DEFAULT_CLEVEL;
   zParams.notificationLevel = displayLevel;
   zParams.dictID = dictID;
   params.zParams = zParams;

From 49acfaeaec44a25c4628a2512965445152e8776a Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Tue, 17 Jul 2018 12:35:09 -0700
Subject: [PATCH 07/13] Move file loading functions to new file for access by
 benchmarking tool

---
 contrib/randomDictBuilder/Makefile |  11 +-
 contrib/randomDictBuilder/io.c     | 243 +++++++++++++++++++++++++++++
 contrib/randomDictBuilder/io.h     |  33 ++++
 contrib/randomDictBuilder/main.c   | 215 +------------------------
 4 files changed, 290 insertions(+), 212 deletions(-)
 create mode 100644 contrib/randomDictBuilder/io.c
 create mode 100644 contrib/randomDictBuilder/io.h

diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile
index 77dd2933..8360a409 100644
--- a/contrib/randomDictBuilder/Makefile
+++ b/contrib/randomDictBuilder/Makefile
@@ -13,15 +13,18 @@ run:
 	echo "Building a random dictionary with given arguments"
 	./main $(ARG)
 
-main: main.o random.o libzstd.a
-	gcc main.o random.o libzstd.a -o main
+main: main.o io.o random.o libzstd.a
+	gcc main.o io.o random.o libzstd.a -o main
 
-main.o: main.c $(PROGRAM_FILES)
-	gcc -c main.c $(PROGRAM_FILES) -I random.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
+main.o: main.c
+	gcc -c main.c -I io.h -I random.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
 
 random.o: random.c
 	gcc -c random.c -I random.h -I ../../lib/common -I ../../lib/dictBuilder
 
+io.o: io.c $(PROGRAM_FILES)
+	gcc -c io.c $(PROGRAM_FILES) -I io.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
+
 libzstd.a:
 	$(MAKE) -C ../../lib libzstd.a
 	mv ../../lib/libzstd.a .
diff --git a/contrib/randomDictBuilder/io.c b/contrib/randomDictBuilder/io.c
new file mode 100644
index 00000000..a5f71498
--- /dev/null
+++ b/contrib/randomDictBuilder/io.c
@@ -0,0 +1,243 @@
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h>   /* strcmp, strlen */
+#include <errno.h>    /* errno */
+#include <ctype.h>
+#include "io.h"
+#include "fileio.h"   /* stdinmark, stdoutmark, ZSTD_EXTENSION */
+#include "platform.h"         /* Large Files support */
+#include "util.h"
+#include "zdict.h"
+
+/*-*************************************
+*  Console display
+***************************************/
+#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
+
+static const U64 g_refreshRate = SEC_TO_MICRO / 6;
+static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
+
+#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
+            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
+            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
+            if (displayLevel>=4) fflush(stderr); } } }
+
+/*-*************************************
+*  Exceptions
+***************************************/
+#ifndef DEBUG
+#  define DEBUG 0
+#endif
+#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
+#define EXM_THROW(error, ...)                                             \
+{                                                                         \
+    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
+    DISPLAY("Error %i : ", error);                                        \
+    DISPLAY(__VA_ARGS__);                                                 \
+    DISPLAY("\n");                                                        \
+    exit(error);                                                          \
+}
+
+
+/*-*************************************
+*  Constants
+***************************************/
+
+#define SAMPLESIZE_MAX (128 KB)
+#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+#define RANDOM_MEMMULT 9
+static const size_t g_maxMemory = (sizeof(size_t) == 4) ?
+                          (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
+
+#define NOISELENGTH 32
+
+
+
+/* ********************************************************
+*  File related operations
+**********************************************************/
+/** loadFiles() :
+ *  load samples from files listed in fileNamesTable into buffer.
+ *  works even if buffer is too small to load all samples.
+ *  Also provides the size of each sample into sampleSizes table,
+ *  which holds sstSize entries and must be sized correctly, using getFileStats().
+ * @return : nb of samples effectively loaded into `buffer`
+ * *bufferSizePtr is modified, it provides the amount of data loaded within buffer.
+ *  sampleSizes is filled with the size of each sample.
+ */
+static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, size_t* sampleSizes,
+                          unsigned sstSize, const char** fileNamesTable, unsigned nbFiles,
+                          size_t targetChunkSize, unsigned displayLevel) {
+    char* const buff = (char*)buffer;
+    size_t pos = 0;   /* write offset into buff == total nb of bytes loaded so far */
+    unsigned nbLoadedChunks = 0, fileIndex;
+
+    for (fileIndex=0; fileIndex<nbFiles; fileIndex++) {
+        const char* const fileName = fileNamesTable[fileIndex];
+        unsigned long long const fs64 = UTIL_getFileSize(fileName);
+        unsigned long long remainingToLoad = (fs64 == UTIL_FILESIZE_UNKNOWN) ? 0 : fs64;
+        U32 const nbChunks = targetChunkSize ? (U32)((fs64 + (targetChunkSize-1)) / targetChunkSize) : 1;   /* targetChunkSize==0 : whole file is one sample */
+        U64 const chunkSize = targetChunkSize ? MIN(targetChunkSize, fs64) : fs64;
+        size_t const maxChunkSize = (size_t)MIN(chunkSize, SAMPLESIZE_MAX);   /* a sample never exceeds SAMPLESIZE_MAX bytes */
+        U32 cnb;
+        FILE* const f = fopen(fileName, "rb");
+        if (f==NULL) EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileName, strerror(errno));
+        DISPLAYUPDATE(2, "Loading %s...       \r", fileName);
+        for (cnb=0; cnb<nbChunks; cnb++) {
+            size_t const toLoad = (size_t)MIN(maxChunkSize, remainingToLoad);
+            if (toLoad > *bufferSizePtr-pos) break;   /* not enough room left in buffer : give up on this file */
+            {   size_t const readSize = fread(buff+pos, 1, toLoad, f);
+                if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName);
+                pos += readSize;
+                sampleSizes[nbLoadedChunks++] = toLoad;
+                remainingToLoad -= targetChunkSize;
+                if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */
+                    fileIndex = nbFiles;  /* stop there */
+                    break;
+                }
+                if (toLoad < targetChunkSize) {   /* less than a full chunk was read : skip ahead to the next chunk boundary */
+                    fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR);
+        }   }   }
+        fclose(f);
+    }
+    DISPLAYLEVEL(2, "\r%79s\r", "");
+    *bufferSizePtr = pos;
+    DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10))
+    return nbLoadedChunks;
+}
+
+#define rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))   /* 32-bit left rotation, valid for 0 < r < 32 ; arguments parenthesized so expressions expand safely */
+static U32 getRand(U32* src)   /* advances the generator state *src and returns a value derived from the new state */
+{
+    static const U32 prime1 = 2654435761U;
+    static const U32 prime2 = 2246822519U;
+    U32 rand32 = *src;
+    rand32 *= prime1;
+    rand32 ^= prime2;
+    rand32  = rotl32(rand32, 13);
+    *src = rand32;   /* the full 32-bit state is kept for the next call */
+    return rand32 >> 5;   /* the 5 lowest bits are dropped from the returned value */
+}
+
+/* shuffle() :
+ * Shuffles a table of file names in place, in a semi-random way (the seed is fixed, so the order is reproducible).
+ * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
+ * it will load random elements from it, instead of just the first ones. */
+static void shuffle(const char** fileNamesTable, unsigned nbFiles) {
+    U32 seed = 0xFD2FB528;
+    unsigned remaining;   /* nb of leading entries that have not received their final value yet */
+    for (remaining = nbFiles; remaining > 1; --remaining) {   /* 0 or 1 entry : nothing to do, and no unsigned wrap-around */
+        unsigned const j = getRand(&seed) % remaining;
+        const char* const tmp = fileNamesTable[j];
+        fileNamesTable[j] = fileNamesTable[remaining - 1];
+        fileNamesTable[remaining - 1] = tmp;
+    }
+}
+
+
+/*-********************************************************
+*  Dictionary training functions
+**********************************************************/
+static size_t findMaxMem(unsigned long long requiredMem) {
+    /* Probes how much memory malloc() can provide : starts from requiredMem rounded up to 8 MB plus one step (capped
+     * to g_maxMemory). @return : the size that could be allocated minus one step, 0 if nothing above one step fits. */
+    size_t const step = 8 MB;
+    void* testmem = NULL;
+    requiredMem = (((requiredMem >> 23) + 1) << 23);
+    requiredMem += step;
+    if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
+    /* stop once less than one step is left : decrementing further would wrap around and report a huge size */
+    while (!testmem && requiredMem >= step) {
+        testmem = malloc((size_t)requiredMem);
+        requiredMem -= step;
+    }
+    free(testmem);   /* testmem was only a probe ; free(NULL) is a no-op when every attempt failed */
+    return (size_t)requiredMem;
+}
+
+void saveDict(const char* dictFileName,   /* writes buffSize bytes from buff into dictFileName ; any failure exits through EXM_THROW */
+                         const void* buff, size_t buffSize) {
+    FILE* const f = fopen(dictFileName, "wb");
+    if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
+
+    { size_t const n = fwrite(buff, 1, buffSize, f);   /* n : nb of bytes actually written */
+      if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) }
+
+    { size_t const n = (size_t)fclose(f);   /* fclose() result is checked too : non-zero is reported as a flush error */
+      if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
+}
+
+/*! getFileStats() :
+ *  Given a list of files, and a chunkSize (0 == no chunk, whole files)
+ *  provides the amount of data to be loaded and the resulting nb of samples.
+ *  This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
+ */
+static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles,
+                              size_t chunkSize, unsigned displayLevel) {
+    fileStats fs;
+    unsigned n;
+    memset(&fs, 0, sizeof(fs));
+    for (n=0; n<nbFiles; n++) {
+        U64 const fileSize = UTIL_getFileSize(fileNamesTable[n]);
+        U64 const srcSize = (fileSize == UTIL_FILESIZE_UNKNOWN) ? 0 : fileSize;   /* a file of unknown size counts as empty */
+        U32 const nbSamples = (U32)(chunkSize ? (srcSize + (chunkSize-1)) / chunkSize : 1);   /* ceil(srcSize/chunkSize), or 1 sample per file when not chunking */
+        U64 const chunkToLoad = chunkSize ? MIN(chunkSize, srcSize) : srcSize;
+        size_t const cappedChunkSize = (size_t)MIN(chunkToLoad, SAMPLESIZE_MAX);   /* each sample is capped to SAMPLESIZE_MAX bytes */
+        fs.totalSizeToLoad += cappedChunkSize * nbSamples;   /* upper bound : a shorter last chunk is still counted at full capped size */
+        fs.oneSampleTooLarge |= (chunkSize > 2*SAMPLESIZE_MAX);
+        fs.nbSamples += nbSamples;
+    }
+    DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10));
+    return fs;
+}
+
+/*! getSampleInfo() :
+ *  Loads samples from fileNamesTable (which gets shuffled in place) into a newly allocated buffer.
+ * @return : malloc'ed sampleInfo describing the samples actually loaded ; exits through EXM_THROW on failure. */
+sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
+                          unsigned maxDictSize, const unsigned displayLevel) {
+    fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
+    size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
+    size_t const memMult = RANDOM_MEMMULT;
+    size_t const maxMem =  findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
+    size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
+    void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
+
+    /* Checks */
+    if ((!sampleSizes) || (!srcBuffer))
+        EXM_THROW(12, "not enough memory for trainFromFiles");   /* should not happen */
+    if (fs.oneSampleTooLarge) {
+        DISPLAYLEVEL(2, "!  Warning : some sample(s) are very large \n");
+        DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small samples. \n");
+        DISPLAYLEVEL(2, "!  As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
+    }
+    if (fs.nbSamples < 5) {
+        DISPLAYLEVEL(2, "!  Warning : nb of samples too low for proper processing ! \n");
+        DISPLAYLEVEL(2, "!  Please provide _one file per sample_. \n");
+        DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
+        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
+    }
+    if (fs.totalSizeToLoad < 8 * (unsigned long long)maxDictSize) {   /* widen before multiplying, so a large maxDictSize cannot wrap */
+        DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
+        DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
+    }
+
+    /* init */
+    if (loadedSize < fs.totalSizeToLoad)
+        DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20));
+
+    /* Load input buffer */
+    DISPLAYLEVEL(3, "Shuffling input files\n");
+    shuffle(fileNamesTable, nbFiles);
+    nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples,
+                        fileNamesTable, nbFiles, chunkSize, displayLevel);   /* nbFiles now holds the nb of samples loaded */
+
+    sampleInfo *info = (sampleInfo *)malloc(sizeof(sampleInfo));
+    if (!info) EXM_THROW(12, "not enough memory for trainFromFiles");
+    info->nbSamples = nbFiles;   /* only the loaded samples have a valid entry in sampleSizes ; may be < fs.nbSamples */
+    info->samplesSizes = sampleSizes;
+    info->srcBuffer = srcBuffer;
+
+    return info;
+}
diff --git a/contrib/randomDictBuilder/io.h b/contrib/randomDictBuilder/io.h
new file mode 100644
index 00000000..4b5639fe
--- /dev/null
+++ b/contrib/randomDictBuilder/io.h
@@ -0,0 +1,33 @@
+#include <stdio.h>  /* fprintf */
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h>   /* strcmp, strlen */
+#include <errno.h>    /* errno */
+#include <ctype.h>
+#include "zstd_internal.h" /* includes zstd.h */
+#include "fileio.h"   /* stdinmark, stdoutmark, ZSTD_EXTENSION */
+#include "platform.h"         /* Large Files support */
+#include "util.h"
+#include "zdict.h"
+
+
+/*-*************************************
+*  Structs
+***************************************/
+typedef struct {                 /* aggregate statistics about the input files, consumed by getSampleInfo() */
+    U64 totalSizeToLoad;         /* total amount of sample data (bytes) that loading all samples would require */
+    unsigned oneSampleTooLarge;  /* non-zero => getSampleInfo() warns that only the first SAMPLESIZE_MAX bytes of each sample are loaded */
+    unsigned nbSamples;          /* total number of samples; getSampleInfo() rejects fewer than 5 */
+} fileStats;
+
+typedef struct {                 /* result of getSampleInfo(): the loaded samples and their layout */
+  const void* srcBuffer;         /* buffer the samples were loaded into; heap-allocated by getSampleInfo() */
+  const size_t *samplesSizes;    /* table holding the size of each sample; heap-allocated by getSampleInfo() */
+  size_t nbSamples;              /* number of samples, as counted in the file statistics */
+}sampleInfo;
+
+
+sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
+                          unsigned maxDictSize, const unsigned displayLevel);
+
+
+void saveDict(const char* dictFileName, const void* buff, size_t buffSize);
diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c
index e66f2847..34a9d99e 100644
--- a/contrib/randomDictBuilder/main.c
+++ b/contrib/randomDictBuilder/main.c
@@ -4,11 +4,11 @@
 #include <errno.h>    /* errno */
 #include <ctype.h>
 #include "random.h"
-#include "fileio.h"   /* stdinmark, stdoutmark, ZSTD_EXTENSION */
-#include "platform.h"         /* Large Files support */
+#include "io.h"
 #include "util.h"
 #include "zdict.h"
 
+
 /*-*************************************
 *  Console display
 ***************************************/
@@ -23,6 +23,7 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
             { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
             if (displayLevel>=4) fflush(stderr); } } }
 
+
 /*-*************************************
 *  Exceptions
 ***************************************/
@@ -39,6 +40,7 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
     exit(error);                                                          \
 }
 
+
 /*-*************************************
 *  Constants
 ***************************************/
@@ -49,29 +51,6 @@ static const unsigned g_defaultMaxDictSize = 110 KB;
 #define DEFAULT_OUTPUTFILE "defaultDict"
 #define DEFAULT_DICTID 0
 
-#define SAMPLESIZE_MAX (128 KB)
-#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
-#define RANDOM_MEMMULT 9
-static const size_t g_maxMemory = (sizeof(size_t) == 4) ?
-                          (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
-
-#define NOISELENGTH 32
-
-
-/*-*************************************
-*  Structs
-***************************************/
-typedef struct {
-    U64 totalSizeToLoad;
-    unsigned oneSampleTooLarge;
-    unsigned nbSamples;
-} fileStats;
-
-typedef struct {
-  const void* srcBuffer;
-  const size_t *samplesSizes;
-  size_t nbSamples;
-}sampleInfo;
 
 
 /*-*************************************
@@ -112,144 +91,11 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
     return result;
 }
 
-/* ********************************************************
-*  File related operations
-**********************************************************/
-/** loadFiles() :
- *  load samples from files listed in fileNamesTable into buffer.
- *  works even if buffer is too small to load all samples.
- *  Also provides the size of each sample into sampleSizes table
- *  which must be sized correctly, using DiB_fileStats().
- * @return : nb of samples effectively loaded into `buffer`
- * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
- *  sampleSizes is filled with the size of each sample.
- */
-static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, size_t* sampleSizes,
-                          unsigned sstSize, const char** fileNamesTable, unsigned nbFiles,
-                          size_t targetChunkSize, unsigned displayLevel) {
-    char* const buff = (char*)buffer;
-    size_t pos = 0;
-    unsigned nbLoadedChunks = 0, fileIndex;
-
-    for (fileIndex=0; fileIndex<nbFiles; fileIndex++) {
-        const char* const fileName = fileNamesTable[fileIndex];
-        unsigned long long const fs64 = UTIL_getFileSize(fileName);
-        unsigned long long remainingToLoad = (fs64 == UTIL_FILESIZE_UNKNOWN) ? 0 : fs64;
-        U32 const nbChunks = targetChunkSize ? (U32)((fs64 + (targetChunkSize-1)) / targetChunkSize) : 1;
-        U64 const chunkSize = targetChunkSize ? MIN(targetChunkSize, fs64) : fs64;
-        size_t const maxChunkSize = (size_t)MIN(chunkSize, SAMPLESIZE_MAX);
-        U32 cnb;
-        FILE* const f = fopen(fileName, "rb");
-        if (f==NULL) EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileName, strerror(errno));
-        DISPLAYUPDATE(2, "Loading %s...       \r", fileName);
-        for (cnb=0; cnb<nbChunks; cnb++) {
-            size_t const toLoad = (size_t)MIN(maxChunkSize, remainingToLoad);
-            if (toLoad > *bufferSizePtr-pos) break;
-            {   size_t const readSize = fread(buff+pos, 1, toLoad, f);
-                if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName);
-                pos += readSize;
-                sampleSizes[nbLoadedChunks++] = toLoad;
-                remainingToLoad -= targetChunkSize;
-                if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */
-                    fileIndex = nbFiles;  /* stop there */
-                    break;
-                }
-                if (toLoad < targetChunkSize) {
-                    fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR);
-        }   }   }
-        fclose(f);
-    }
-    DISPLAYLEVEL(2, "\r%79s\r", "");
-    *bufferSizePtr = pos;
-    DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10))
-    return nbLoadedChunks;
-}
-
-#define rotl32(x,r) ((x << r) | (x >> (32 - r)))
-static U32 getRand(U32* src)
-{
-    static const U32 prime1 = 2654435761U;
-    static const U32 prime2 = 2246822519U;
-    U32 rand32 = *src;
-    rand32 *= prime1;
-    rand32 ^= prime2;
-    rand32  = rotl32(rand32, 13);
-    *src = rand32;
-    return rand32 >> 5;
-}
-
-/* shuffle() :
- * shuffle a table of file names in a semi-random way
- * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
- * it will load random elements from it, instead of just the first ones. */
-static void shuffle(const char** fileNamesTable, unsigned nbFiles) {
-    U32 seed = 0xFD2FB528;
-    unsigned i;
-    for (i = nbFiles - 1; i > 0; --i) {
-        unsigned const j = getRand(&seed) % (i + 1);
-        const char* const tmp = fileNamesTable[j];
-        fileNamesTable[j] = fileNamesTable[i];
-        fileNamesTable[i] = tmp;
-    }
-}
 
 
-/*-********************************************************
-*  Dictionary training functions
-**********************************************************/
-static size_t findMaxMem(unsigned long long requiredMem) {
-    size_t const step = 8 MB;
-    void* testmem = NULL;
-
-    requiredMem = (((requiredMem >> 23) + 1) << 23);
-    requiredMem += step;
-    if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
-
-    while (!testmem) {
-        testmem = malloc((size_t)requiredMem);
-        requiredMem -= step;
-    }
-
-    free(testmem);
-    return (size_t)requiredMem;
-}
-
-static void saveDict(const char* dictFileName,
-                         const void* buff, size_t buffSize) {
-    FILE* const f = fopen(dictFileName, "wb");
-    if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
-
-    { size_t const n = fwrite(buff, 1, buffSize, f);
-      if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) }
-
-    { size_t const n = (size_t)fclose(f);
-      if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
-}
-
-/*! getFileStats() :
- *  Given a list of files, and a chunkSize (0 == no chunk, whole files)
- *  provides the amount of data to be loaded and the resulting nb of samples.
- *  This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
- */
-static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles,
-                              size_t chunkSize, unsigned displayLevel) {
-    fileStats fs;
-    unsigned n;
-    memset(&fs, 0, sizeof(fs));
-    for (n=0; n<nbFiles; n++) {
-        U64 const fileSize = UTIL_getFileSize(fileNamesTable[n]);
-        U64 const srcSize = (fileSize == UTIL_FILESIZE_UNKNOWN) ? 0 : fileSize;
-        U32 const nbSamples = (U32)(chunkSize ? (srcSize + (chunkSize-1)) / chunkSize : 1);
-        U64 const chunkToLoad = chunkSize ? MIN(chunkSize, srcSize) : srcSize;
-        size_t const cappedChunkSize = (size_t)MIN(chunkToLoad, SAMPLESIZE_MAX);
-        fs.totalSizeToLoad += cappedChunkSize * nbSamples;
-        fs.oneSampleTooLarge |= (chunkSize > 2*SAMPLESIZE_MAX);
-        fs.nbSamples += nbSamples;
-    }
-    DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10));
-    return fs;
-}
-
+/*-*************************************
+*  RANDOM
+***************************************/
 int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info,
                           unsigned maxDictSize,
                           ZDICT_random_params_t *params) {
@@ -281,53 +127,6 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info,
     return result;
 }
 
-sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
-                          unsigned maxDictSize, const unsigned displayLevel) {
-    fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
-    size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
-    size_t const memMult = RANDOM_MEMMULT;
-    size_t const maxMem =  findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
-    size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
-    void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
-
-    /* Checks */
-    if ((!sampleSizes) || (!srcBuffer))
-        EXM_THROW(12, "not enough memory for trainFromFiles");   /* should not happen */
-    if (fs.oneSampleTooLarge) {
-        DISPLAYLEVEL(2, "!  Warning : some sample(s) are very large \n");
-        DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small samples. \n");
-        DISPLAYLEVEL(2, "!  As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
-    }
-    if (fs.nbSamples < 5) {
-        DISPLAYLEVEL(2, "!  Warning : nb of samples too low for proper processing ! \n");
-        DISPLAYLEVEL(2, "!  Please provide _one file per sample_. \n");
-        DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
-        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
-    }
-    if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) {
-        DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
-        DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
-    }
-
-    /* init */
-    if (loadedSize < fs.totalSizeToLoad)
-        DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20));
-
-    /* Load input buffer */
-    DISPLAYLEVEL(3, "Shuffling input files\n");
-    shuffle(fileNamesTable, nbFiles);
-    nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples,
-                        fileNamesTable, nbFiles, chunkSize, displayLevel);
-
-    sampleInfo *info = (sampleInfo *)malloc(sizeof(sampleInfo));
-
-    info->nbSamples = fs.nbSamples;
-    info->samplesSizes = sampleSizes;
-    info->srcBuffer = srcBuffer;
-
-    return info;
-}
-
 
 
 int main(int argCount, const char* argv[])

From e6fe4058388c820444a80d9d10aa5d840fab3c0c Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Tue, 17 Jul 2018 12:42:53 -0700
Subject: [PATCH 08/13] Make test PHONY target

---
 contrib/randomDictBuilder/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile
index 8360a409..678ff28a 100644
--- a/contrib/randomDictBuilder/Makefile
+++ b/contrib/randomDictBuilder/Makefile
@@ -6,6 +6,7 @@ ARG :=
 
 all: main run clean
 
+.PHONY: test
 test: main testrun testshell clean
 
 .PHONY: run

From 896ff0644a2531a22edf78ea9cb6b58a4de9c77f Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Tue, 17 Jul 2018 16:01:44 -0700
Subject: [PATCH 09/13] Fix deallocation problem and add documentation

---
 contrib/randomDictBuilder/io.c     |  7 +++++++
 contrib/randomDictBuilder/io.h     | 17 +++++++++++++++++
 contrib/randomDictBuilder/main.c   | 20 +++++++++++---------
 contrib/randomDictBuilder/random.c | 11 ++---------
 contrib/randomDictBuilder/random.h |  9 ++++++++-
 5 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/contrib/randomDictBuilder/io.c b/contrib/randomDictBuilder/io.c
index a5f71498..1c3eda58 100644
--- a/contrib/randomDictBuilder/io.c
+++ b/contrib/randomDictBuilder/io.c
@@ -241,3 +241,10 @@ sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t
 
     return info;
 }
+
+
+void freeSampleInfo(sampleInfo *info) {
+    if (info->samplesSizes) free((void*)(info->samplesSizes));
+    if (info->srcBuffer) free((void*)(info->srcBuffer));
+    free(info);
+}
diff --git a/contrib/randomDictBuilder/io.h b/contrib/randomDictBuilder/io.h
index 4b5639fe..55967f76 100644
--- a/contrib/randomDictBuilder/io.h
+++ b/contrib/randomDictBuilder/io.h
@@ -26,8 +26,25 @@ typedef struct {
 }sampleInfo;
 
 
+
+/*! getSampleInfo():
+ *  Load from input files and add samples to buffer
+ * @return: a sampleInfo struct containing information about the buffer where samples are stored,
+ *          size of each sample, and total number of samples
+ */
 sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
                           unsigned maxDictSize, const unsigned displayLevel);
 
 
+
+/*! freeSampleInfo():
+ *  Free memory allocated for info
+ */
+void freeSampleInfo(sampleInfo *info);
+
+
+
+/*! saveDict():
+ *  Save data stored on buff to dictFileName
+ */
 void saveDict(const char* dictFileName, const void* buff, size_t buffSize);
diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c
index 34a9d99e..1f12c7a4 100644
--- a/contrib/randomDictBuilder/main.c
+++ b/contrib/randomDictBuilder/main.c
@@ -46,7 +46,6 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
 ***************************************/
 static const unsigned g_defaultMaxDictSize = 110 KB;
 #define DEFAULT_CLEVEL 3
-#define DEFAULT_INPUTFILE ""
 #define DEFAULT_k 200
 #define DEFAULT_OUTPUTFILE "defaultDict"
 #define DEFAULT_DICTID 0
@@ -135,30 +134,29 @@ int main(int argCount, const char* argv[])
   const char* programName = argv[0];
   int operationResult = 0;
 
-  char* inputFile = DEFAULT_INPUTFILE;
+  /* Initialize arguments to default values */
   unsigned k = DEFAULT_k;
-  char* outputFile = DEFAULT_OUTPUTFILE;
+  const char* outputFile = DEFAULT_OUTPUTFILE;
   unsigned dictID = DEFAULT_DICTID;
   unsigned maxDictSize = g_defaultMaxDictSize;
 
+  /* Initialize table to store input files */
   const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*));
   unsigned filenameIdx = 0;
 
+  /* Parse arguments */
   for (int i = 1; i < argCount; i++) {
     const char* argument = argv[i];
     if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; }
     if (longCommandWArg(&argument, "in=")) {
-      inputFile = malloc(strlen(argument) + 1);
-      strcpy(inputFile, argument);
-      filenameTable[filenameIdx] = inputFile;
+      filenameTable[filenameIdx] = argument;
       filenameIdx++;
       continue;
     }
     if (longCommandWArg(&argument, "out=")) {
-      outputFile = malloc(strlen(argument) + 1);
-      strcpy(outputFile, argument);
+      outputFile = argument;
       continue;
     }
     DISPLAYLEVEL(1, "Incorrect parameters\n");
@@ -168,7 +166,7 @@ int main(int argCount, const char* argv[])
 
   char* fileNamesBuf = NULL;
   unsigned fileNamesNb = filenameIdx;
-  int followLinks = 0;
+  int followLinks = 0; /* 0 : symbolic links are not followed when building the file list */
   const char** extendedFileList = NULL;
   extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf,
                                         &fileNamesNb, followLinks);
@@ -194,5 +192,9 @@ int main(int argCount, const char* argv[])
                     filenameIdx, blockSize, maxDictSize, zParams.notificationLevel);
   operationResult = RANDOM_trainFromFiles(outputFile, info, maxDictSize, &params);
 
+  /* Free allocated memory */
+  UTIL_freeFileList(extendedFileList, fileNamesBuf);
+  freeSampleInfo(info);
+
   return operationResult;
 }
diff --git a/contrib/randomDictBuilder/random.c b/contrib/randomDictBuilder/random.c
index cfed14a4..34aec39e 100644
--- a/contrib/randomDictBuilder/random.c
+++ b/contrib/randomDictBuilder/random.c
@@ -113,15 +113,8 @@ static size_t RANDOM_buildDictionary(const size_t totalSamplesSize, const BYTE *
 }
 
 
-/*! ZDICT_trainFromBuffer_random():
- *  Train a dictionary from an array of samples using the RANDOM algorithm.
- *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
- *  supplied with an array of sizes `samplesSizes`, providing the size of each
- *  sample, in order.
- *  The resulting dictionary will be saved into `dictBuffer`.
- * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
- *          or an error code, which can be tested with ZDICT_isError().
- */
+
+
 ZDICTLIB_API size_t ZDICT_trainFromBuffer_random(
     void *dictBuffer, size_t dictBufferCapacity,
     const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
diff --git a/contrib/randomDictBuilder/random.h b/contrib/randomDictBuilder/random.h
index b6696323..c3146f86 100644
--- a/contrib/randomDictBuilder/random.h
+++ b/contrib/randomDictBuilder/random.h
@@ -23,7 +23,14 @@ typedef struct {
 } ZDICT_random_params_t;
 
 
-
+/*! ZDICT_trainFromBuffer_random():
+ *  Train a dictionary from an array of samples.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *          or an error code, which can be tested with ZDICT_isError().
+ */
 ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( void *dictBuffer, size_t dictBufferCapacity,
     const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
     ZDICT_random_params_t parameters);

From ce09fb723d1311e62c920430fb14634e9b67dd70 Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Tue, 17 Jul 2018 16:13:40 -0700
Subject: [PATCH 10/13] Update freeSampleInfo

---
 contrib/randomDictBuilder/io.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/contrib/randomDictBuilder/io.c b/contrib/randomDictBuilder/io.c
index 1c3eda58..67c40858 100644
--- a/contrib/randomDictBuilder/io.c
+++ b/contrib/randomDictBuilder/io.c
@@ -244,6 +244,7 @@ sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t
 
 
 void freeSampleInfo(sampleInfo *info) {
+    if (!info) return;
     if (info->samplesSizes) free((void*)(info->samplesSizes));
     if (info->srcBuffer) free((void*)(info->srcBuffer));
     free(info);

From 52e7cf0e405ac6eb827322b607d094125646bbfb Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Wed, 18 Jul 2018 10:40:13 -0700
Subject: [PATCH 11/13] Add cleanup to trainfromFiles and move RANDOM_segment_t
 declaration

---
 contrib/randomDictBuilder/main.c   | 3 ++-
 contrib/randomDictBuilder/random.c | 9 +++++++++
 contrib/randomDictBuilder/random.h | 7 -------
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c
index 1f12c7a4..36c4326b 100644
--- a/contrib/randomDictBuilder/main.c
+++ b/contrib/randomDictBuilder/main.c
@@ -114,7 +114,7 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info,
         if (ZDICT_isError(dictSize)) {
             DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
             result = 1;
-            free(dictBuffer);
+            goto _cleanup;
         }
         /* save dict */
         DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
@@ -122,6 +122,7 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info,
     }
 
     /* clean up */
+_cleanup:
     free(dictBuffer);
     return result;
 }
diff --git a/contrib/randomDictBuilder/random.c b/contrib/randomDictBuilder/random.c
index 34aec39e..5276bea9 100644
--- a/contrib/randomDictBuilder/random.c
+++ b/contrib/randomDictBuilder/random.c
@@ -47,6 +47,15 @@ static size_t RANDOM_sum(const size_t *samplesSizes, unsigned nbSamples) {
 }
 
 
+/**
+ * A segment is an inclusive range in the source.
+ */
+typedef struct {
+  U32 begin;   /* first position covered by the segment (inclusive) */
+  U32 end;     /* last position covered by the segment (inclusive) */
+} RANDOM_segment_t;
+
+
 /**
  * Selects a random segment from totalSamplesSize - k + 1 possible segments
  */
diff --git a/contrib/randomDictBuilder/random.h b/contrib/randomDictBuilder/random.h
index c3146f86..352775f9 100644
--- a/contrib/randomDictBuilder/random.h
+++ b/contrib/randomDictBuilder/random.h
@@ -8,13 +8,6 @@
 #endif
 #include "zdict.h"
 
-/**
- * A segment is an inclusive range in the source.
- */
-typedef struct {
-  U32 begin;
-  U32 end;
-} RANDOM_segment_t;
 
 
 typedef struct {

From 5bb46a898e6565e5bc1ee861999384f806f83831 Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Wed, 18 Jul 2018 12:15:49 -0700
Subject: [PATCH 12/13] Rename cleanup

---
 contrib/randomDictBuilder/main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c
index 36c4326b..4751a9e1 100644
--- a/contrib/randomDictBuilder/main.c
+++ b/contrib/randomDictBuilder/main.c
@@ -114,7 +114,7 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info,
         if (ZDICT_isError(dictSize)) {
             DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
             result = 1;
-            goto _cleanup;
+            goto _done;
         }
         /* save dict */
         DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
@@ -122,7 +122,7 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info,
     }
 
     /* clean up */
-_cleanup:
+_done:
     free(dictBuffer);
     return result;
 }

From 0c5eaef248443342dd1cd19f5e434334bef6fc4c Mon Sep 17 00:00:00 2001
From: Jennifer Liu <jenniferliu620@fb.com>
Date: Thu, 19 Jul 2018 13:44:27 -0700
Subject: [PATCH 13/13] Update Makefile

---
 contrib/randomDictBuilder/Makefile | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile
index 678ff28a..5f9240bf 100644
--- a/contrib/randomDictBuilder/Makefile
+++ b/contrib/randomDictBuilder/Makefile
@@ -1,8 +1,11 @@
-PROGRAM_FILES := ../../programs/fileio.c
+ARG :=
+
+CC ?= gcc
+CFLAGS ?= -O3
+INCLUDES := -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
 
 TEST_INPUT := ../../lib
 TEST_OUTPUT := randomDict
-ARG :=
 
 all: main run clean
 
@@ -15,16 +18,16 @@ run:
 	./main $(ARG)
 
 main: main.o io.o random.o libzstd.a
-	gcc main.o io.o random.o libzstd.a -o main
+	$(CC) $(CFLAGS) main.o io.o random.o libzstd.a -o main
 
 main.o: main.c
-	gcc -c main.c -I io.h -I random.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
+	$(CC) $(CFLAGS) $(INCLUDES) -c main.c
 
 random.o: random.c
-	gcc -c random.c -I random.h -I ../../lib/common -I ../../lib/dictBuilder
+	$(CC) $(CFLAGS) $(INCLUDES) -c random.c
 
-io.o: io.c $(PROGRAM_FILES)
-	gcc -c io.c $(PROGRAM_FILES) -I io.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
+io.o: io.c
+	$(CC) $(CFLAGS) $(INCLUDES) -c io.c
 
 libzstd.a:
 	$(MAKE) -C ../../lib libzstd.a
@@ -44,8 +47,6 @@ testshell: test.sh
 
 .PHONY: clean
 clean:
-	rm -f libzstd.a main
-	rm -f ../../lib/*/*.o
-	rm -f ../../programs/*.o
-	rm -f *.o
+	rm -f *.o main libzstd.a
+	$(MAKE) -C ../../lib clean
 	echo "Cleaning is completed"