From a23a3b95f9c00ecf52216bd7fe768e41eac4e269 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 13 Jul 2018 16:05:14 -0700 Subject: [PATCH 01/13] Add random dictionary builder --- contrib/randomDictBuilder/Makefile | 48 +++ contrib/randomDictBuilder/README.md | 13 + contrib/randomDictBuilder/main.c | 125 ++++++++ contrib/randomDictBuilder/random.c | 455 ++++++++++++++++++++++++++++ contrib/randomDictBuilder/random.h | 53 ++++ contrib/randomDictBuilder/test.sh | 14 + 6 files changed, 708 insertions(+) create mode 100644 contrib/randomDictBuilder/Makefile create mode 100644 contrib/randomDictBuilder/README.md create mode 100644 contrib/randomDictBuilder/main.c create mode 100644 contrib/randomDictBuilder/random.c create mode 100644 contrib/randomDictBuilder/random.h create mode 100644 contrib/randomDictBuilder/test.sh diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile new file mode 100644 index 00000000..a2aade23 --- /dev/null +++ b/contrib/randomDictBuilder/Makefile @@ -0,0 +1,48 @@ +PROGRAM_FILES := ../../programs/fileio.c + +TEST_INPUT := ../../lib +TEST_OUTPUT := randomDict +ARG := + +all: main testrun test clean + +run: main rand clean + +.PHONY: rand +rand: + echo "Building a random dictionary with given arguments" + ./main $(ARG) + + +main: random.o main.o libzstd.a + gcc random.o main.o libzstd.a -o main + +main.o: main.c + gcc -c main.c -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder -I random.h + +random.o: $(PROGRAM_FILES) random.c + gcc -c $(PROGRAM_FILES) -I ../../programs -I ../../lib/common -I random.h random.c + +libzstd.a: + $(MAKE) -C ../../lib libzstd.a + mv ../../lib/libzstd.a . 
+ +.PHONY: testrun +testrun: main + echo "Run with $(TEST_INPUT) and $(TEST_OUTPUT) " + ./main in=$(TEST_INPUT) out=$(TEST_OUTPUT) + zstd -be3 -D $(TEST_OUTPUT) -r $(TEST_INPUT) -q + rm -f $(TEST_OUTPUT) + +.PHONY: test +test: test.sh + sh test.sh + echo "Finish running test.sh" + +.PHONY: clean +clean: + rm -f libzstd.a main + rm -f ../../lib/*/*.o + rm -f ../../programs/*.o + rm -f *.o + echo "Cleaning is completed" diff --git a/contrib/randomDictBuilder/README.md b/contrib/randomDictBuilder/README.md new file mode 100644 index 00000000..cadffdf2 --- /dev/null +++ b/contrib/randomDictBuilder/README.md @@ -0,0 +1,13 @@ +Random Dictionary Builder + +### Permitted Arguments: +Input Files (in=fileName): files used to build dictionary, can include multiple files, each following "in=", required +Output Dictionary (out=dictName): if not provided, default to defaultDict +Dictionary ID (dictID=#): positive number, if not provided, default to 0 +Maximum Dictionary Size (maxdict=#): positive number, in bytes, if not provided, default to 110KB +Size of Randomly Selected Segment (k=#): positive number, in bytes, if not provided, default to 200 +Compression Level (c=#): positive number, if not provided, default to 3 + +### Examples: +make run ARG="in=../../lib/dictBuilder out=dict100 dictID=520" +make run ARG="in=../../lib/dictBuilder in=../../lib/compress" diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c new file mode 100644 index 00000000..15eb5c44 --- /dev/null +++ b/contrib/randomDictBuilder/main.c @@ -0,0 +1,125 @@ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* strcmp, strlen */ +#include /* errno */ +#include +#include "fileio.h" /* stdinmark, stdoutmark, ZSTD_EXTENSION */ +#include "random.h" +#include "util.h" + +#define DISPLAY(...) fprintf(stderr, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) 
if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } + +static const unsigned g_defaultMaxDictSize = 110 KB; +#define DEFAULT_CLEVEL 3 +#define DEFAULT_INPUTFILE "" +#define DEFAULT_k 200 +#define DEFAULT_OUTPUTFILE "defaultDict" +#define DEFAULT_DICTID 0 + + +static unsigned readU32FromChar(const char** stringPtr) +{ + const char errorMsg[] = "error: numeric value too large"; + unsigned result = 0; + while ((**stringPtr >='0') && (**stringPtr <='9')) { + unsigned const max = (((unsigned)(-1)) / 10) - 1; + if (result > max) exit(1); + result *= 10, result += **stringPtr - '0', (*stringPtr)++ ; + } + if ((**stringPtr=='K') || (**stringPtr=='M')) { + unsigned const maxK = ((unsigned)(-1)) >> 10; + if (result > maxK) exit(1); + result <<= 10; + if (**stringPtr=='M') { + if (result > maxK) exit(1); + result <<= 10; + } + (*stringPtr)++; /* skip `K` or `M` */ + if (**stringPtr=='i') (*stringPtr)++; + if (**stringPtr=='B') (*stringPtr)++; + } + return result; +} + + +/** longCommandWArg() : + * check if *stringPtr is the same as longCommand. + * If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand. + * @return 0 and doesn't modify *stringPtr otherwise. 
+ */ +static unsigned longCommandWArg(const char** stringPtr, const char* longCommand) +{ + size_t const comSize = strlen(longCommand); + int const result = !strncmp(*stringPtr, longCommand, comSize); + if (result) *stringPtr += comSize; + return result; +} + + +int main(int argCount, const char* argv[]) +{ + int displayLevel = 2; + const char* programName = argv[0]; + int operationResult = 0; + + unsigned cLevel = DEFAULT_CLEVEL; + char* inputFile = DEFAULT_INPUTFILE; + unsigned k = DEFAULT_k; + char* outputFile = DEFAULT_OUTPUTFILE; + unsigned dictID = DEFAULT_DICTID; + unsigned maxDictSize = g_defaultMaxDictSize; + + const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); + unsigned filenameIdx = 0; + + for (int i = 1; i < argCount; i++) { + const char* argument = argv[i]; + if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "c=")) { cLevel = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "in=")) { + inputFile = malloc(strlen(argument) + 1); + strcpy(inputFile, argument); + filenameTable[filenameIdx] = inputFile; + filenameIdx++; + continue; + } + if (longCommandWArg(&argument, "out=")) { + outputFile = malloc(strlen(argument) + 1); + strcpy(outputFile, argument); + continue; + } + DISPLAYLEVEL(1, "Incorrect parameters\n"); + operationResult = 1; + return operationResult; + } + + + char* fileNamesBuf = NULL; + unsigned fileNamesNb = filenameIdx; + int followLinks = 0; + const char** extendedFileList = NULL; + extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, &fileNamesNb, followLinks); + if (extendedFileList) { + unsigned u; + for (u=0; u /* fprintf */ +#include /* malloc, free, qsort */ +#include /* memset */ +#include 
/* clock */ +#include "zstd_internal.h" /* includes zstd.h */ +#ifndef ZDICT_STATIC_LINKING_ONLY +#define ZDICT_STATIC_LINKING_ONLY +#endif +#include "random.h" +#include "platform.h" /* Large Files support */ +#include "util.h" /* UTIL_getFileSize, UTIL_getTotalFileSize */ + +/*-************************************* +* Constants +***************************************/ +#define SAMPLESIZE_MAX (128 KB) +#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) +#define RANDOM_MEMMULT 9 +static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); + +#define NOISELENGTH 32 +#define DEFAULT_K 200 + +/*-************************************* +* Console display +***************************************/ +#define DISPLAY(...) fprintf(stderr, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } + +static const U64 g_refreshRate = SEC_TO_MICRO / 6; +static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; + +#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \ + if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \ + { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ + if (displayLevel>=4) fflush(stderr); } } } + + +/*-************************************* +* Exceptions +***************************************/ +#ifndef DEBUG +# define DEBUG 0 +#endif +#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); +#define EXM_THROW(error, ...) \ +{ \ + DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ + DISPLAY("Error %i : ", error); \ + DISPLAY(__VA_ARGS__); \ + DISPLAY("\n"); \ + exit(error); \ +} + + +/* ******************************************************** +* File related operations +**********************************************************/ +/** loadFiles() : + * load samples from files listed in fileNamesTable into buffer. + * works even if buffer is too small to load all samples. 
+ * Also provides the size of each sample into sampleSizes table + * which must be sized correctly, using DiB_fileStats(). + * @return : nb of samples effectively loaded into `buffer` + * *bufferSizePtr is modified, it provides the amount data loaded within buffer. + * sampleSizes is filled with the size of each sample. + */ +static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, + size_t* sampleSizes, unsigned sstSize, + const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize, + unsigned displayLevel) +{ + char* const buff = (char*)buffer; + size_t pos = 0; + unsigned nbLoadedChunks = 0, fileIndex; + + for (fileIndex=0; fileIndex *bufferSizePtr-pos) break; + { size_t const readSize = fread(buff+pos, 1, toLoad, f); + if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName); + pos += readSize; + sampleSizes[nbLoadedChunks++] = toLoad; + remainingToLoad -= targetChunkSize; + if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */ + fileIndex = nbFiles; /* stop there */ + break; + } + if (toLoad < targetChunkSize) { + fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR); + } } } + fclose(f); + } + DISPLAYLEVEL(2, "\r%79s\r", ""); + *bufferSizePtr = pos; + DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10)) + return nbLoadedChunks; +} + + + +#define rotl32(x,r) ((x << r) | (x >> (32 - r))) +static U32 getRand(U32* src) +{ + static const U32 prime1 = 2654435761U; + static const U32 prime2 = 2246822519U; + U32 rand32 = *src; + rand32 *= prime1; + rand32 ^= prime2; + rand32 = rotl32(rand32, 13); + *src = rand32; + return rand32 >> 5; +} + + +/* shuffle() : + * shuffle a table of file names in a semi-random way + * It improves dictionary quality by reducing "locality" impact, so if sample set is very large, + * it will load random elements from it, instead of just the first ones. 
*/ +static void shuffle(const char** fileNamesTable, unsigned nbFiles) { + U32 seed = 0xFD2FB528; + unsigned i; + for (i = nbFiles - 1; i > 0; --i) { + unsigned const j = getRand(&seed) % (i + 1); + const char* const tmp = fileNamesTable[j]; + fileNamesTable[j] = fileNamesTable[i]; + fileNamesTable[i] = tmp; + } +} + + + +/*-******************************************************** +* Dictionary training functions +**********************************************************/ +static size_t findMaxMem(unsigned long long requiredMem) +{ + size_t const step = 8 MB; + void* testmem = NULL; + + requiredMem = (((requiredMem >> 23) + 1) << 23); + requiredMem += step; + if (requiredMem > g_maxMemory) requiredMem = g_maxMemory; + + while (!testmem) { + testmem = malloc((size_t)requiredMem); + requiredMem -= step; + } + + free(testmem); + return (size_t)requiredMem; +} + +static void saveDict(const char* dictFileName, + const void* buff, size_t buffSize) +{ + FILE* const f = fopen(dictFileName, "wb"); + if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName); + + { size_t const n = fwrite(buff, 1, buffSize, f); + if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) } + + { size_t const n = (size_t)fclose(f); + if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) } +} + +/*! getFileStats() : + * Given a list of files, and a chunkSize (0 == no chunk, whole files) + * provides the amount of data to be loaded and the resulting nb of samples. + * This is useful primarily for allocation purpose => sample buffer, and sample sizes table. 
+ */ +static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel) +{ + fileStats fs; + unsigned n; + memset(&fs, 0, sizeof(fs)); + for (n=0; n 2*SAMPLESIZE_MAX); + fs.nbSamples += nbSamples; + } + DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10)); + return fs; +} + + + + + +/* ******************************************************** +* Random Dictionary Builder +**********************************************************/ +/** + * Returns the sum of the sample sizes. + */ +static size_t RANDOM_sum(const size_t *samplesSizes, unsigned nbSamples) { + size_t sum = 0; + unsigned i; + for (i = 0; i < nbSamples; ++i) { + sum += samplesSizes[i]; + } + return sum; +} + + +/** + * Selects a random segment from totalSamplesSize - k + 1 possible segments + */ +static RANDOM_segment_t RANDOM_selectSegment(const RANDOM_ctx_t *ctx, + ZDICT_random_params_t parameters) { + const U32 k = parameters.k; + RANDOM_segment_t segment; + unsigned index; + + /* Seed random number generator */ + srand((unsigned)time(NULL)); + /* Randomly generate a number from 0 to sampleSizes - k */ + index = rand()%(ctx->totalSamplesSize - k + 1); + + /* inclusive */ + segment.begin = index; + segment.end = index + k - 1; + + return segment; +} + + +/** + * Check the validity of the parameters. + * Returns non-zero if the parameters are valid and 0 otherwise. + */ +static int RANDOM_checkParameters(ZDICT_random_params_t parameters, size_t maxDictSize) { + /* k is a required parameter */ + if (parameters.k == 0) { + return 0; + } + /* k <= maxDictSize */ + if (parameters.k > maxDictSize) { + return 0; + } + return 1; +} + + +/** + * Clean up a context initialized with `RANDOM_ctx_init()`. + */ +static void RANDOM_ctx_destroy(RANDOM_ctx_t *ctx) { + if (!ctx) { + return; + } + if (ctx->offsets) { + free(ctx->offsets); + ctx->offsets = NULL; + } +} + + +/** + * Prepare a context for dictionary building. 
+ * Returns 1 on success or zero on error. + * The context must be destroyed with `RANDOM_ctx_destroy()`. + */ +static int RANDOM_ctx_init(RANDOM_ctx_t *ctx, const void *samplesBuffer, + const size_t *samplesSizes, unsigned nbSamples) { + const BYTE *const samples = (const BYTE *)samplesBuffer; + const size_t totalSamplesSize = RANDOM_sum(samplesSizes, nbSamples); + const int displayLevel = 2; + /* Checks */ + if (totalSamplesSize >= (size_t)RANDOM_MAX_SAMPLES_SIZE) { + DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n", + (U32)(totalSamplesSize>>20), (RANDOM_MAX_SAMPLES_SIZE >> 20)); + return 0; + } + memset(ctx, 0, sizeof(*ctx)); + DISPLAYLEVEL(1, "Building dictionary from %u samples of total size %u\n", nbSamples, + (U32)totalSamplesSize); + ctx->samples = samples; + ctx->samplesSizes = samplesSizes; + ctx->nbSamples = nbSamples; + ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t)); + ctx->totalSamplesSize = (U32)totalSamplesSize; + if (!ctx->offsets) { + DISPLAYLEVEL(1, "Failed to allocate buffer for offsets\n"); + RANDOM_ctx_destroy(ctx); + return 0; + } + { + U32 i; + ctx->offsets[0] = 0; + for (i = 1; i <= nbSamples; ++i) { + ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1]; + } + } + return 1; +} + + +/** + * Given the prepared context build the dictionary. 
+ */ +static size_t RANDOM_buildDictionary(const RANDOM_ctx_t *ctx, void *dictBuffer, + size_t dictBufferCapacity, + ZDICT_random_params_t parameters) { + BYTE *const dict = (BYTE *)dictBuffer; + size_t tail = dictBufferCapacity; + const int displayLevel = parameters.zParams.notificationLevel; + while (tail > 0) { + + /* Select a segment */ + RANDOM_segment_t segment = RANDOM_selectSegment(ctx, parameters); + + size_t segmentSize; + segmentSize = MIN(segment.end - segment.begin + 1, tail); + + tail -= segmentSize; + memcpy(dict + tail, ctx->samples + segment.begin, segmentSize); + DISPLAYUPDATE( + 2, "\r%u%% ", + (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity)); + } + + return tail; +} + +/*! ZDICT_trainFromBuffer_random(): + * Train a dictionary from an array of samples using the RANDOM algorithm. + * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, + * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. + * The resulting dictionary will be saved into `dictBuffer`. + * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) + * or an error code, which can be tested with ZDICT_isError(). 
+ */ +ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( + void *dictBuffer, size_t dictBufferCapacity, + const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, + ZDICT_random_params_t parameters) { + const int displayLevel = parameters.zParams.notificationLevel; + BYTE* const dict = (BYTE*)dictBuffer; + RANDOM_ctx_t ctx; + /* Checks */ + if (!RANDOM_checkParameters(parameters, dictBufferCapacity)) { + DISPLAYLEVEL(1, "k is incorrect\n"); + return ERROR(GENERIC); + } + if (nbSamples == 0) { + DISPLAYLEVEL(1, "Random must have at least one input file\n"); + return ERROR(GENERIC); + } + if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { + DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n", + ZDICT_DICTSIZE_MIN); + return ERROR(dstSize_tooSmall); + } + + if (!RANDOM_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples)) { + return ERROR(GENERIC); + } + DISPLAYLEVEL(2, "Building dictionary\n"); + { + const size_t tail = RANDOM_buildDictionary(&ctx, dictBuffer, dictBufferCapacity, parameters); + const size_t dictSize = ZDICT_finalizeDictionary( + dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, + samplesBuffer, samplesSizes, nbSamples, parameters.zParams); + if (!ZSTD_isError(dictSize)) { + DISPLAYLEVEL(2, "Constructed dictionary of size %u\n", + (U32)dictSize); + } + RANDOM_ctx_destroy(&ctx); + return dictSize; + } +} + + +int RANDOM_trainFromFiles(const char* dictFileName, unsigned maxDictSize, + const char** fileNamesTable, unsigned nbFiles, + size_t chunkSize, ZDICT_random_params_t *params){ + unsigned const displayLevel = params->zParams.notificationLevel; + void* const dictBuffer = malloc(maxDictSize); + fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel); + size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t)); + size_t const memMult = RANDOM_MEMMULT; + size_t const maxMem = findMaxMem(fs.totalSizeToLoad * memMult) / memMult; + size_t loadedSize = (size_t) MIN ((unsigned 
long long)maxMem, fs.totalSizeToLoad); + void* const srcBuffer = malloc(loadedSize+NOISELENGTH); + int result = 0; + + /* Checks */ + if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer)) + EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */ + if (fs.oneSampleTooLarge) { + DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n"); + DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n"); + DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX); + } + if (fs.nbSamples < 5) { + DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n"); + DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n"); + DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n"); + EXM_THROW(14, "nb of samples too low"); /* we now clearly forbid this case */ + } + if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) { + DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n"); + DISPLAYLEVEL(2, "! 
Samples should be about 100x larger than target dictionary size \n"); + } + + /* init */ + if (loadedSize < fs.totalSizeToLoad) + DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20)); + + /* Load input buffer */ + DISPLAYLEVEL(3, "Shuffling input files\n"); + shuffle(fileNamesTable, nbFiles); + nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel); + + { size_t dictSize; + dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, srcBuffer, + sampleSizes, fs.nbSamples, *params); + DISPLAYLEVEL(2, "k=%u\n", params->k); + if (ZDICT_isError(dictSize)) { + DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ + result = 1; + goto _cleanup; + } + /* save dict */ + DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); + saveDict(dictFileName, dictBuffer, dictSize); + } + + /* clean up */ +_cleanup: + free(srcBuffer); + free(sampleSizes); + free(dictBuffer); + return result; +} diff --git a/contrib/randomDictBuilder/random.h b/contrib/randomDictBuilder/random.h new file mode 100644 index 00000000..05879641 --- /dev/null +++ b/contrib/randomDictBuilder/random.h @@ -0,0 +1,53 @@ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* memset */ +#include /* clock */ +#include "zstd_internal.h" /* includes zstd.h */ +#ifndef ZDICT_STATIC_LINKING_ONLY +#define ZDICT_STATIC_LINKING_ONLY +#endif +#include "zdict.h" + + +/************************************** +* Context +***************************************/ +typedef struct { + const BYTE *samples; + size_t *offsets; + const size_t *samplesSizes; + size_t nbSamples; + U32 totalSamplesSize; +} RANDOM_ctx_t; + +/** + * A segment is an inclusive range in the source. 
+ */ +typedef struct { + U32 begin; + U32 end; +} RANDOM_segment_t; + + +typedef struct { + unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+]; Default to 200 */ + ZDICT_params_t zParams; +} ZDICT_random_params_t; + + +typedef struct { + U64 totalSizeToLoad; + unsigned oneSampleTooLarge; + unsigned nbSamples; +} fileStats; + + +ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( + void *dictBuffer, size_t dictBufferCapacity, + const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, + ZDICT_random_params_t parameters); + + +int RANDOM_trainFromFiles(const char* dictFileName, unsigned maxDictSize, + const char** fileNamesTable, unsigned nbFiles, + size_t chunkSize, ZDICT_random_params_t *params); diff --git a/contrib/randomDictBuilder/test.sh b/contrib/randomDictBuilder/test.sh new file mode 100644 index 00000000..552650ee --- /dev/null +++ b/contrib/randomDictBuilder/test.sh @@ -0,0 +1,14 @@ +echo "Building random dictionary with c=5 in=../../lib/common k=200 out=dict1" +./main c=5 in=../../lib/common k=200 out=dict1 +zstd -be3 -D dict1 -r ../../lib/common -q +echo "Building random dictionary with c=9 in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000" +./main c=9 in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000 +zstd -be3 -D dict2 -r ../../lib/common -q +echo "Building random dictionary with 2 sample sources" +./main in=../../lib/common in=../../lib/compress out=dict3 +zstd -be3 -D dict3 -r ../../lib/common -q +echo "Removing dict1 dict2 dict3" +rm -f dict1 dict2 dict3 + +echo "Testing with invalid parameters, should fail" +! 
./main r=10 From 31731df4dab0df7b465de2de5641b2e3416c9086 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 13 Jul 2018 17:38:53 -0700 Subject: [PATCH 02/13] Remove clevel and update documentation --- contrib/randomDictBuilder/README.md | 15 ++++++++++----- contrib/randomDictBuilder/main.c | 11 ++++++++--- contrib/randomDictBuilder/test.sh | 8 ++++---- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/contrib/randomDictBuilder/README.md b/contrib/randomDictBuilder/README.md index cadffdf2..de2c7ff6 100644 --- a/contrib/randomDictBuilder/README.md +++ b/contrib/randomDictBuilder/README.md @@ -1,12 +1,17 @@ Random Dictionary Builder ### Permitted Arguments: -Input Files (in=fileName): files used to build dictionary, can include multiple files, each following "in=", required +Input File/Directory (in=fileName): required; file/directory used to build dictionary; if directory, will operate recursively for files inside directory; can include multiple files/directories, each following "in=" Output Dictionary (out=dictName): if not provided, default to defaultDict -Dictionary ID (dictID=#): positive number, if not provided, default to 0 -Maximum Dictionary Size (maxdict=#): positive number, in bytes, if not provided, default to 110KB -Size of Randomly Selected Segment (k=#): positive number, in bytes, if not provided, default to 200 -Compression Level (c=#): positive number, if not provided, default to 3 +Dictionary ID (dictID=#): nonnegative number; if not provided, default to 0 +Maximum Dictionary Size (maxdict=#): positive number; in bytes, if not provided, default to 110KB +Size of Randomly Selected Segment (k=#): positive number; in bytes; if not provided, default to 200 +Compression Level (c=#): positive number; if not provided, default to 3 + + +###Usage: +To build a random dictionary with the provided arguments: make run ARG= followed by arguments + ### Examples: make run ARG="in=../../lib/dictBuilder out=dict100 dictID=520" diff --git 
a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c index 15eb5c44..cf0b9476 100644 --- a/contrib/randomDictBuilder/main.c +++ b/contrib/randomDictBuilder/main.c @@ -63,7 +63,7 @@ int main(int argCount, const char* argv[]) const char* programName = argv[0]; int operationResult = 0; - unsigned cLevel = DEFAULT_CLEVEL; + /* Initialize parameters with default value */ char* inputFile = DEFAULT_INPUTFILE; unsigned k = DEFAULT_k; char* outputFile = DEFAULT_OUTPUTFILE; @@ -76,10 +76,10 @@ int main(int argCount, const char* argv[]) for (int i = 1; i < argCount; i++) { const char* argument = argv[i]; if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; } - if (longCommandWArg(&argument, "c=")) { cLevel = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "in=")) { + /* Allow multiple input files */ inputFile = malloc(strlen(argument) + 1); strcpy(inputFile, argument); filenameTable[filenameIdx] = inputFile; @@ -96,6 +96,11 @@ int main(int argCount, const char* argv[]) return operationResult; } + if (maxDictSize == 0) { + DISPLAYLEVEL(1, "maxDictSize should not be 0.\n"); + operationResult = 1; + return operationResult; + } char* fileNamesBuf = NULL; unsigned fileNamesNb = filenameIdx; @@ -114,7 +119,7 @@ int main(int argCount, const char* argv[]) ZDICT_random_params_t params; ZDICT_params_t zParams; - zParams.compressionLevel = cLevel; + zParams.compressionLevel = DEFAULT_CLEVEL; zParams.notificationLevel = displayLevel; zParams.dictID = dictID; params.zParams = zParams; diff --git a/contrib/randomDictBuilder/test.sh b/contrib/randomDictBuilder/test.sh index 552650ee..497820f8 100644 --- a/contrib/randomDictBuilder/test.sh +++ b/contrib/randomDictBuilder/test.sh @@ -1,8 +1,8 @@ -echo "Building random dictionary with 
c=5 in=../../lib/common k=200 out=dict1" -./main c=5 in=../../lib/common k=200 out=dict1 +echo "Building random dictionary with in=../../lib/common k=200 out=dict1" +./main in=../../lib/common k=200 out=dict1 zstd -be3 -D dict1 -r ../../lib/common -q -echo "Building random dictionary with c=9 in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000" -./main c=9 in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000 +echo "Building random dictionary with in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000" +./main in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000 zstd -be3 -D dict2 -r ../../lib/common -q echo "Building random dictionary with 2 sample sources" ./main in=../../lib/common in=../../lib/compress out=dict3 From 0e5fbc10facdce2def08e4f4ecb67d255694df3a Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Fri, 13 Jul 2018 17:41:09 -0700 Subject: [PATCH 03/13] Update README --- contrib/randomDictBuilder/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/contrib/randomDictBuilder/README.md b/contrib/randomDictBuilder/README.md index de2c7ff6..09f1e808 100644 --- a/contrib/randomDictBuilder/README.md +++ b/contrib/randomDictBuilder/README.md @@ -6,7 +6,6 @@ Output Dictionary (out=dictName): if not provided, default to defaultDict Dictionary ID (dictID=#): nonnegative number; if not provided, default to 0 Maximum Dictionary Size (maxdict=#): positive number; in bytes, if not provided, default to 110KB Size of Randomly Selected Segment (k=#): positive number; in bytes; if not provided, default to 200 -Compression Level (c=#): positive number; if not provided, default to 3 ###Usage: From b5806d33db813dfb2bac7cd3b97b5bcf09ee57b7 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Mon, 16 Jul 2018 16:03:04 -0700 Subject: [PATCH 04/13] Refactor RANDOM --- contrib/randomDictBuilder/Makefile | 12 +- contrib/randomDictBuilder/main.c | 297 ++++++++++++++++++++++++- contrib/randomDictBuilder/random.c | 343 
++--------------------------- contrib/randomDictBuilder/random.h | 23 -- 4 files changed, 314 insertions(+), 361 deletions(-) diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile index a2aade23..443f6f04 100644 --- a/contrib/randomDictBuilder/Makefile +++ b/contrib/randomDictBuilder/Makefile @@ -14,14 +14,14 @@ rand: ./main $(ARG) -main: random.o main.o libzstd.a - gcc random.o main.o libzstd.a -o main +main: main.o random.o libzstd.a + gcc main.o random.o libzstd.a -o main -main.o: main.c - gcc -c main.c -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder -I random.h +main.o: main.c $(PROGRAM_FILES) + gcc -c main.c $(PROGRAM_FILES) -I random.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder -random.o: $(PROGRAM_FILES) random.c - gcc -c $(PROGRAM_FILES) -I ../../programs -I ../../lib/common -I random.h random.c +random.o: random.c + gcc -c random.c -I random.h -I ../../lib/common -I ../../lib/dictBuilder libzstd.a: $(MAKE) -C ../../lib libzstd.a diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c index cf0b9476..d9295aa9 100644 --- a/contrib/randomDictBuilder/main.c +++ b/contrib/randomDictBuilder/main.c @@ -3,13 +3,45 @@ #include /* strcmp, strlen */ #include /* errno */ #include -#include "fileio.h" /* stdinmark, stdoutmark, ZSTD_EXTENSION */ #include "random.h" +#include "fileio.h" /* stdinmark, stdoutmark, ZSTD_EXTENSION */ +#include "platform.h" /* Large Files support */ #include "util.h" +#include "zdict.h" +/*-************************************* +* Console display +***************************************/ #define DISPLAY(...) fprintf(stderr, __VA_ARGS__) #define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } +static const U64 g_refreshRate = SEC_TO_MICRO / 6; +static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; + +#define DISPLAYUPDATE(l, ...) 
{ if (displayLevel>=l) { \ + if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \ + { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ + if (displayLevel>=4) fflush(stderr); } } } + +/*-************************************* +* Exceptions +***************************************/ +#ifndef DEBUG +# define DEBUG 0 +#endif +#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); +#define EXM_THROW(error, ...) \ +{ \ + DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ + DISPLAY("Error %i : ", error); \ + DISPLAY(__VA_ARGS__); \ + DISPLAY("\n"); \ + exit(error); \ +} + +/*-************************************* +* Constants +***************************************/ static const unsigned g_defaultMaxDictSize = 110 KB; #define DEFAULT_CLEVEL 3 #define DEFAULT_INPUTFILE "" @@ -17,7 +49,33 @@ static const unsigned g_defaultMaxDictSize = 110 KB; #define DEFAULT_OUTPUTFILE "defaultDict" #define DEFAULT_DICTID 0 +#define SAMPLESIZE_MAX (128 KB) +#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) +#define RANDOM_MEMMULT 9 +static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); +#define NOISELENGTH 32 + + +/*-************************************* +* Structs +***************************************/ +typedef struct { + U64 totalSizeToLoad; + unsigned oneSampleTooLarge; + unsigned nbSamples; +} fileStats; + +typedef struct { + const void* srcBuffer; + const size_t *samplesSizes; + size_t nbSamples; +}sampleInfo; + + +/*-************************************* +* Commandline related functions +***************************************/ static unsigned readU32FromChar(const char** stringPtr) { const char errorMsg[] = "error: numeric value too large"; @@ -42,7 +100,6 @@ static unsigned readU32FromChar(const char** stringPtr) return result; } - /** longCommandWArg() : * check if *stringPtr is the same as longCommand. 
* If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand. @@ -56,6 +113,225 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand) return result; } +/* ******************************************************** +* File related operations +**********************************************************/ +/** loadFiles() : + * load samples from files listed in fileNamesTable into buffer. + * works even if buffer is too small to load all samples. + * Also provides the size of each sample into sampleSizes table + * which must be sized correctly, using DiB_fileStats(). + * @return : nb of samples effectively loaded into `buffer` + * *bufferSizePtr is modified, it provides the amount data loaded within buffer. + * sampleSizes is filled with the size of each sample. + */ +static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, + size_t* sampleSizes, unsigned sstSize, + const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize, + unsigned displayLevel) +{ + char* const buff = (char*)buffer; + size_t pos = 0; + unsigned nbLoadedChunks = 0, fileIndex; + + for (fileIndex=0; fileIndex *bufferSizePtr-pos) break; + { size_t const readSize = fread(buff+pos, 1, toLoad, f); + if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName); + pos += readSize; + sampleSizes[nbLoadedChunks++] = toLoad; + remainingToLoad -= targetChunkSize; + if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */ + fileIndex = nbFiles; /* stop there */ + break; + } + if (toLoad < targetChunkSize) { + fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR); + } } } + fclose(f); + } + DISPLAYLEVEL(2, "\r%79s\r", ""); + *bufferSizePtr = pos; + DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10)) + return nbLoadedChunks; +} + +#define rotl32(x,r) ((x << r) | (x >> (32 - r))) +static U32 getRand(U32* src) +{ + static const U32 prime1 = 2654435761U; + static const U32 prime2 = 2246822519U; + U32 rand32 = 
*src; + rand32 *= prime1; + rand32 ^= prime2; + rand32 = rotl32(rand32, 13); + *src = rand32; + return rand32 >> 5; +} + +/* shuffle() : + * shuffle a table of file names in a semi-random way + * It improves dictionary quality by reducing "locality" impact, so if sample set is very large, + * it will load random elements from it, instead of just the first ones. */ +static void shuffle(const char** fileNamesTable, unsigned nbFiles) { + U32 seed = 0xFD2FB528; + unsigned i; + for (i = nbFiles - 1; i > 0; --i) { + unsigned const j = getRand(&seed) % (i + 1); + const char* const tmp = fileNamesTable[j]; + fileNamesTable[j] = fileNamesTable[i]; + fileNamesTable[i] = tmp; + } +} + + +/*-******************************************************** +* Dictionary training functions +**********************************************************/ +static size_t findMaxMem(unsigned long long requiredMem) +{ + size_t const step = 8 MB; + void* testmem = NULL; + + requiredMem = (((requiredMem >> 23) + 1) << 23); + requiredMem += step; + if (requiredMem > g_maxMemory) requiredMem = g_maxMemory; + + while (!testmem) { + testmem = malloc((size_t)requiredMem); + requiredMem -= step; + } + + free(testmem); + return (size_t)requiredMem; +} + +static void saveDict(const char* dictFileName, + const void* buff, size_t buffSize) +{ + FILE* const f = fopen(dictFileName, "wb"); + if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName); + + { size_t const n = fwrite(buff, 1, buffSize, f); + if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) } + + { size_t const n = (size_t)fclose(f); + if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) } +} + +/*! getFileStats() : + * Given a list of files, and a chunkSize (0 == no chunk, whole files) + * provides the amount of data to be loaded and the resulting nb of samples. + * This is useful primarily for allocation purpose => sample buffer, and sample sizes table. 
+ */ +static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel) +{ + fileStats fs; + unsigned n; + memset(&fs, 0, sizeof(fs)); + for (n=0; n 2*SAMPLESIZE_MAX); + fs.nbSamples += nbSamples; + } + DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10)); + return fs; +} + +int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, unsigned maxDictSize, + ZDICT_random_params_t *params){ + unsigned const displayLevel = params->zParams.notificationLevel; + void* const dictBuffer = malloc(maxDictSize); + + int result = 0; + + /* Checks */ + if (!dictBuffer) + EXM_THROW(12, "not enough memory for trainFromFiles"); /* should not happen */ + + { size_t dictSize; + dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, *params); + DISPLAYLEVEL(2, "k=%u\n", params->k); + if (ZDICT_isError(dictSize)) { + DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ + result = 1; + free(dictBuffer); + } + /* save dict */ + DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); + saveDict(dictFileName, dictBuffer, dictSize); + } + + /* clean up */ + free(dictBuffer); + return result; +} + +sampleInfo* getSampleInfo(const char** fileNamesTable, + unsigned nbFiles, size_t chunkSize, unsigned maxDictSize, const unsigned displayLevel){ + fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel); + size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t)); + size_t const memMult = RANDOM_MEMMULT; + size_t const maxMem = findMaxMem(fs.totalSizeToLoad * memMult) / memMult; + size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad); + void* const srcBuffer = malloc(loadedSize+NOISELENGTH); + + /* Checks */ + if ((!sampleSizes) || (!srcBuffer)) + EXM_THROW(12, "not enough memory 
for trainFromFiles"); /* should not happen */ + if (fs.oneSampleTooLarge) { + DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n"); + DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n"); + DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX); + } + if (fs.nbSamples < 5) { + DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n"); + DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n"); + DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n"); + EXM_THROW(14, "nb of samples too low"); /* we now clearly forbid this case */ + } + if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) { + DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n"); + DISPLAYLEVEL(2, "! Samples should be about 100x larger than target dictionary size \n"); + } + + /* init */ + if (loadedSize < fs.totalSizeToLoad) + DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20)); + + /* Load input buffer */ + DISPLAYLEVEL(3, "Shuffling input files\n"); + shuffle(fileNamesTable, nbFiles); + nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel); + + sampleInfo *info = (sampleInfo *)malloc(sizeof(sampleInfo)); + + info->nbSamples = fs.nbSamples; + info->samplesSizes = sampleSizes; + info->srcBuffer = srcBuffer; + + return info; +} + + int main(int argCount, const char* argv[]) { @@ -63,7 +339,7 @@ int main(int argCount, const char* argv[]) const char* programName = argv[0]; int operationResult = 0; - /* Initialize parameters with default value */ + unsigned cLevel = DEFAULT_CLEVEL; char* inputFile = DEFAULT_INPUTFILE; unsigned k = DEFAULT_k; char* outputFile = DEFAULT_OUTPUTFILE; @@ -76,10 +352,10 @@ int main(int argCount, const char* argv[]) for (int i = 1; i < argCount; i++) 
{ const char* argument = argv[i]; if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "c=")) { cLevel = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "in=")) { - /* Allow multiple input files */ inputFile = malloc(strlen(argument) + 1); strcpy(inputFile, argument); filenameTable[filenameIdx] = inputFile; @@ -96,12 +372,6 @@ int main(int argCount, const char* argv[]) return operationResult; } - if (maxDictSize == 0) { - DISPLAYLEVEL(1, "maxDictSize should not be 0.\n"); - operationResult = 1; - return operationResult; - } - char* fileNamesBuf = NULL; unsigned fileNamesNb = filenameIdx; int followLinks = 0; @@ -119,12 +389,15 @@ int main(int argCount, const char* argv[]) ZDICT_random_params_t params; ZDICT_params_t zParams; - zParams.compressionLevel = DEFAULT_CLEVEL; + zParams.compressionLevel = cLevel; zParams.notificationLevel = displayLevel; zParams.dictID = dictID; params.zParams = zParams; params.k = k; - operationResult = RANDOM_trainFromFiles(outputFile, maxDictSize, filenameTable, filenameIdx, blockSize, &params); + sampleInfo* info= getSampleInfo(filenameTable, + filenameIdx, blockSize, maxDictSize, zParams.notificationLevel); + operationResult = RANDOM_trainFromFiles(outputFile, info, maxDictSize, &params); + return operationResult; } diff --git a/contrib/randomDictBuilder/random.c b/contrib/randomDictBuilder/random.c index a59427ba..96c02389 100644 --- a/contrib/randomDictBuilder/random.c +++ b/contrib/randomDictBuilder/random.c @@ -5,24 +5,12 @@ #include <stdlib.h> /* malloc, free, qsort */ #include <string.h> /* memset */ #include <time.h> /* clock */ -#include "zstd_internal.h" /* includes zstd.h */ +#include "random.h" +#include "util.h" /* UTIL_getFileSize, UTIL_getTotalFileSize */ #ifndef
ZDICT_STATIC_LINKING_ONLY #define ZDICT_STATIC_LINKING_ONLY #endif -#include "random.h" -#include "platform.h" /* Large Files support */ -#include "util.h" /* UTIL_getFileSize, UTIL_getTotalFileSize */ - -/*-************************************* -* Constants -***************************************/ -#define SAMPLESIZE_MAX (128 KB) -#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) -#define RANDOM_MEMMULT 9 -static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); - -#define NOISELENGTH 32 -#define DEFAULT_K 200 +#include "zdict.h" /*-************************************* * Console display @@ -30,179 +18,16 @@ static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((siz #define DISPLAY(...) fprintf(stderr, __VA_ARGS__) #define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } -static const U64 g_refreshRate = SEC_TO_MICRO / 6; -static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; - -#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \ - if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \ - { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ - if (displayLevel>=4) fflush(stderr); } } } - - -/*-************************************* -* Exceptions -***************************************/ -#ifndef DEBUG -# define DEBUG 0 -#endif -#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); -#define EXM_THROW(error, ...) \ -{ \ - DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ - DISPLAY("Error %i : ", error); \ - DISPLAY(__VA_ARGS__); \ - DISPLAY("\n"); \ - exit(error); \ -} - - -/* ******************************************************** -* File related operations -**********************************************************/ -/** loadFiles() : - * load samples from files listed in fileNamesTable into buffer. - * works even if buffer is too small to load all samples. 
- * Also provides the size of each sample into sampleSizes table - * which must be sized correctly, using DiB_fileStats(). - * @return : nb of samples effectively loaded into `buffer` - * *bufferSizePtr is modified, it provides the amount data loaded within buffer. - * sampleSizes is filled with the size of each sample. - */ -static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, - size_t* sampleSizes, unsigned sstSize, - const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize, - unsigned displayLevel) -{ - char* const buff = (char*)buffer; - size_t pos = 0; - unsigned nbLoadedChunks = 0, fileIndex; - - for (fileIndex=0; fileIndex *bufferSizePtr-pos) break; - { size_t const readSize = fread(buff+pos, 1, toLoad, f); - if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName); - pos += readSize; - sampleSizes[nbLoadedChunks++] = toLoad; - remainingToLoad -= targetChunkSize; - if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */ - fileIndex = nbFiles; /* stop there */ - break; - } - if (toLoad < targetChunkSize) { - fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR); - } } } - fclose(f); - } - DISPLAYLEVEL(2, "\r%79s\r", ""); - *bufferSizePtr = pos; - DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10)) - return nbLoadedChunks; -} - - - -#define rotl32(x,r) ((x << r) | (x >> (32 - r))) -static U32 getRand(U32* src) -{ - static const U32 prime1 = 2654435761U; - static const U32 prime2 = 2246822519U; - U32 rand32 = *src; - rand32 *= prime1; - rand32 ^= prime2; - rand32 = rotl32(rand32, 13); - *src = rand32; - return rand32 >> 5; -} - - -/* shuffle() : - * shuffle a table of file names in a semi-random way - * It improves dictionary quality by reducing "locality" impact, so if sample set is very large, - * it will load random elements from it, instead of just the first ones. 
*/ -static void shuffle(const char** fileNamesTable, unsigned nbFiles) { - U32 seed = 0xFD2FB528; - unsigned i; - for (i = nbFiles - 1; i > 0; --i) { - unsigned const j = getRand(&seed) % (i + 1); - const char* const tmp = fileNamesTable[j]; - fileNamesTable[j] = fileNamesTable[i]; - fileNamesTable[i] = tmp; - } -} - - - -/*-******************************************************** -* Dictionary training functions -**********************************************************/ -static size_t findMaxMem(unsigned long long requiredMem) -{ - size_t const step = 8 MB; - void* testmem = NULL; - - requiredMem = (((requiredMem >> 23) + 1) << 23); - requiredMem += step; - if (requiredMem > g_maxMemory) requiredMem = g_maxMemory; - - while (!testmem) { - testmem = malloc((size_t)requiredMem); - requiredMem -= step; - } - - free(testmem); - return (size_t)requiredMem; -} - -static void saveDict(const char* dictFileName, - const void* buff, size_t buffSize) -{ - FILE* const f = fopen(dictFileName, "wb"); - if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName); - - { size_t const n = fwrite(buff, 1, buffSize, f); - if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) } - - { size_t const n = (size_t)fclose(f); - if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) } -} - -/*! getFileStats() : - * Given a list of files, and a chunkSize (0 == no chunk, whole files) - * provides the amount of data to be loaded and the resulting nb of samples. - * This is useful primarily for allocation purpose => sample buffer, and sample sizes table. - */ -static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel) -{ - fileStats fs; - unsigned n; - memset(&fs, 0, sizeof(fs)); - for (n=0; n 2*SAMPLESIZE_MAX); - fs.nbSamples += nbSamples; - } - DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10)); - return fs; -} - - +#define LOCALDISPLAYUPDATE(displayLevel, l, ...) 
\ + if (displayLevel >= l) { \ + if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) { \ + g_time = clock(); \ + DISPLAY(__VA_ARGS__); \ + } \ + } +#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(displayLevel, l, __VA_ARGS__) +static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100; +static clock_t g_time = 0; @@ -225,16 +50,14 @@ static size_t RANDOM_sum(const size_t *samplesSizes, unsigned nbSamples) { /** * Selects a random segment from totalSamplesSize - k + 1 possible segments */ -static RANDOM_segment_t RANDOM_selectSegment(const RANDOM_ctx_t *ctx, +static RANDOM_segment_t RANDOM_selectSegment(const size_t totalSamplesSize, ZDICT_random_params_t parameters) { const U32 k = parameters.k; RANDOM_segment_t segment; unsigned index; - /* Seed random number generator */ - srand((unsigned)time(NULL)); /* Randomly generate a number from 0 to sampleSizes - k */ - index = rand()%(ctx->totalSamplesSize - k + 1); + index = rand()%(totalSamplesSize - k + 1); /* inclusive */ segment.begin = index; @@ -261,65 +84,11 @@ static int RANDOM_checkParameters(ZDICT_random_params_t parameters, size_t maxDi } -/** - * Clean up a context initialized with `RANDOM_ctx_init()`. - */ -static void RANDOM_ctx_destroy(RANDOM_ctx_t *ctx) { - if (!ctx) { - return; - } - if (ctx->offsets) { - free(ctx->offsets); - ctx->offsets = NULL; - } -} - - -/** - * Prepare a context for dictionary building. - * Returns 1 on success or zero on error. - * The context must be destroyed with `RANDOM_ctx_destroy()`. 
- */ -static int RANDOM_ctx_init(RANDOM_ctx_t *ctx, const void *samplesBuffer, - const size_t *samplesSizes, unsigned nbSamples) { - const BYTE *const samples = (const BYTE *)samplesBuffer; - const size_t totalSamplesSize = RANDOM_sum(samplesSizes, nbSamples); - const int displayLevel = 2; - /* Checks */ - if (totalSamplesSize >= (size_t)RANDOM_MAX_SAMPLES_SIZE) { - DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n", - (U32)(totalSamplesSize>>20), (RANDOM_MAX_SAMPLES_SIZE >> 20)); - return 0; - } - memset(ctx, 0, sizeof(*ctx)); - DISPLAYLEVEL(1, "Building dictionary from %u samples of total size %u\n", nbSamples, - (U32)totalSamplesSize); - ctx->samples = samples; - ctx->samplesSizes = samplesSizes; - ctx->nbSamples = nbSamples; - ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t)); - ctx->totalSamplesSize = (U32)totalSamplesSize; - if (!ctx->offsets) { - DISPLAYLEVEL(1, "Failed to allocate buffer for offsets\n"); - RANDOM_ctx_destroy(ctx); - return 0; - } - { - U32 i; - ctx->offsets[0] = 0; - for (i = 1; i <= nbSamples; ++i) { - ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1]; - } - } - return 1; -} - - /** * Given the prepared context build the dictionary. 
*/ -static size_t RANDOM_buildDictionary(const RANDOM_ctx_t *ctx, void *dictBuffer, - size_t dictBufferCapacity, +static size_t RANDOM_buildDictionary(const size_t totalSamplesSize, const BYTE *samples, + void *dictBuffer, size_t dictBufferCapacity, ZDICT_random_params_t parameters) { BYTE *const dict = (BYTE *)dictBuffer; size_t tail = dictBufferCapacity; @@ -327,13 +96,13 @@ static size_t RANDOM_buildDictionary(const RANDOM_ctx_t *ctx, void *dictBuffer, while (tail > 0) { /* Select a segment */ - RANDOM_segment_t segment = RANDOM_selectSegment(ctx, parameters); + RANDOM_segment_t segment = RANDOM_selectSegment(totalSamplesSize, parameters); size_t segmentSize; segmentSize = MIN(segment.end - segment.begin + 1, tail); tail -= segmentSize; - memcpy(dict + tail, ctx->samples + segment.begin, segmentSize); + memcpy(dict + tail, samples + segment.begin, segmentSize); DISPLAYUPDATE( 2, "\r%u%% ", (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity)); @@ -342,6 +111,7 @@ static size_t RANDOM_buildDictionary(const RANDOM_ctx_t *ctx, void *dictBuffer, return tail; } + /*! ZDICT_trainFromBuffer_random(): * Train a dictionary from an array of samples using the RANDOM algorithm. 
* Samples must be stored concatenated in a single flat buffer `samplesBuffer`, @@ -356,7 +126,6 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( ZDICT_random_params_t parameters) { const int displayLevel = parameters.zParams.notificationLevel; BYTE* const dict = (BYTE*)dictBuffer; - RANDOM_ctx_t ctx; /* Checks */ if (!RANDOM_checkParameters(parameters, dictBufferCapacity)) { DISPLAYLEVEL(1, "k is incorrect\n"); @@ -371,13 +140,12 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( ZDICT_DICTSIZE_MIN); return ERROR(dstSize_tooSmall); } + const size_t totalSamplesSize = RANDOM_sum(samplesSizes, nbSamples); + const BYTE *const samples = (const BYTE *)samplesBuffer; - if (!RANDOM_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples)) { - return ERROR(GENERIC); - } DISPLAYLEVEL(2, "Building dictionary\n"); { - const size_t tail = RANDOM_buildDictionary(&ctx, dictBuffer, dictBufferCapacity, parameters); + const size_t tail = RANDOM_buildDictionary(totalSamplesSize, samples, dictBuffer, dictBufferCapacity, parameters); const size_t dictSize = ZDICT_finalizeDictionary( dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, samplesBuffer, samplesSizes, nbSamples, parameters.zParams); @@ -385,71 +153,6 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( DISPLAYLEVEL(2, "Constructed dictionary of size %u\n", (U32)dictSize); } - RANDOM_ctx_destroy(&ctx); return dictSize; } } - - -int RANDOM_trainFromFiles(const char* dictFileName, unsigned maxDictSize, - const char** fileNamesTable, unsigned nbFiles, - size_t chunkSize, ZDICT_random_params_t *params){ - unsigned const displayLevel = params->zParams.notificationLevel; - void* const dictBuffer = malloc(maxDictSize); - fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel); - size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t)); - size_t const memMult = RANDOM_MEMMULT; - size_t const maxMem = findMaxMem(fs.totalSizeToLoad * memMult) / memMult; - size_t 
loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad); - void* const srcBuffer = malloc(loadedSize+NOISELENGTH); - int result = 0; - - /* Checks */ - if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer)) - EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */ - if (fs.oneSampleTooLarge) { - DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n"); - DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n"); - DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX); - } - if (fs.nbSamples < 5) { - DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n"); - DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n"); - DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n"); - EXM_THROW(14, "nb of samples too low"); /* we now clearly forbid this case */ - } - if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) { - DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n"); - DISPLAYLEVEL(2, "! 
Samples should be about 100x larger than target dictionary size \n"); - } - - /* init */ - if (loadedSize < fs.totalSizeToLoad) - DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20)); - - /* Load input buffer */ - DISPLAYLEVEL(3, "Shuffling input files\n"); - shuffle(fileNamesTable, nbFiles); - nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel); - - { size_t dictSize; - dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, srcBuffer, - sampleSizes, fs.nbSamples, *params); - DISPLAYLEVEL(2, "k=%u\n", params->k); - if (ZDICT_isError(dictSize)) { - DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ - result = 1; - goto _cleanup; - } - /* save dict */ - DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); - saveDict(dictFileName, dictBuffer, dictSize); - } - - /* clean up */ -_cleanup: - free(srcBuffer); - free(sampleSizes); - free(dictBuffer); - return result; -} diff --git a/contrib/randomDictBuilder/random.h b/contrib/randomDictBuilder/random.h index 05879641..77529daf 100644 --- a/contrib/randomDictBuilder/random.h +++ b/contrib/randomDictBuilder/random.h @@ -8,18 +8,6 @@ #endif #include "zdict.h" - -/************************************** -* Context -***************************************/ -typedef struct { - const BYTE *samples; - size_t *offsets; - const size_t *samplesSizes; - size_t nbSamples; - U32 totalSamplesSize; -} RANDOM_ctx_t; - /** * A segment is an inclusive range in the source. 
*/ @@ -35,19 +23,8 @@ typedef struct { } ZDICT_random_params_t; -typedef struct { - U64 totalSizeToLoad; - unsigned oneSampleTooLarge; - unsigned nbSamples; -} fileStats; - ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, ZDICT_random_params_t parameters); - - -int RANDOM_trainFromFiles(const char* dictFileName, unsigned maxDictSize, - const char** fileNamesTable, unsigned nbFiles, - size_t chunkSize, ZDICT_random_params_t *params); From 1f7fa5cdd6555e22dfa8c2dc1f5c17293e703fe3 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Mon, 16 Jul 2018 16:31:59 -0700 Subject: [PATCH 05/13] Fix spacing and Edit Makefile (now run with make instead of make run) --- contrib/randomDictBuilder/Makefile | 13 +++++---- contrib/randomDictBuilder/README.md | 9 ++++--- contrib/randomDictBuilder/main.c | 42 ++++++++++++++--------------- contrib/randomDictBuilder/random.c | 9 ++++--- contrib/randomDictBuilder/random.h | 5 ++-- 5 files changed, 40 insertions(+), 38 deletions(-) diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile index 443f6f04..77dd2933 100644 --- a/contrib/randomDictBuilder/Makefile +++ b/contrib/randomDictBuilder/Makefile @@ -4,16 +4,15 @@ TEST_INPUT := ../../lib TEST_OUTPUT := randomDict ARG := -all: main testrun test clean +all: main run clean -run: main rand clean +test: main testrun testshell clean -.PHONY: rand -rand: +.PHONY: run +run: echo "Building a random dictionary with given arguments" ./main $(ARG) - main: main.o random.o libzstd.a gcc main.o random.o libzstd.a -o main @@ -34,8 +33,8 @@ testrun: main zstd -be3 -D $(TEST_OUTPUT) -r $(TEST_INPUT) -q rm -f $(TEST_OUTPUT) -.PHONY: test -test: test.sh +.PHONY: testshell +testshell: test.sh sh test.sh echo "Finish running test.sh" diff --git a/contrib/randomDictBuilder/README.md b/contrib/randomDictBuilder/README.md index 09f1e808..0e70d3dc 100644 --- 
a/contrib/randomDictBuilder/README.md +++ b/contrib/randomDictBuilder/README.md @@ -7,11 +7,14 @@ Dictionary ID (dictID=#): nonnegative number; if not provided, default to 0 Maximum Dictionary Size (maxdict=#): positive number; in bytes, if not provided, default to 110KB Size of Randomly Selected Segment (k=#): positive number; in bytes; if not provided, default to 200 +###Running Test: +make test + ###Usage: -To build a random dictionary with the provided arguments: make run ARG= followed by arguments +To build a random dictionary with the provided arguments: make ARG= followed by arguments ### Examples: -make run ARG="in=../../lib/dictBuilder out=dict100 dictID=520" -make run ARG="in=../../lib/dictBuilder in=../../lib/compress" +make ARG="in=../../lib/dictBuilder out=dict100 dictID=520" +make ARG="in=../../lib/dictBuilder in=../../lib/compress" diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c index d9295aa9..e195188b 100644 --- a/contrib/randomDictBuilder/main.c +++ b/contrib/randomDictBuilder/main.c @@ -52,7 +52,8 @@ static const unsigned g_defaultMaxDictSize = 110 KB; #define SAMPLESIZE_MAX (128 KB) #define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) #define RANDOM_MEMMULT 9 -static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); +static const size_t g_maxMemory = (sizeof(size_t) == 4) ? 
+ (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); #define NOISELENGTH 32 @@ -76,8 +77,7 @@ typedef struct { /*-************************************* * Commandline related functions ***************************************/ -static unsigned readU32FromChar(const char** stringPtr) -{ +static unsigned readU32FromChar(const char** stringPtr){ const char errorMsg[] = "error: numeric value too large"; unsigned result = 0; while ((**stringPtr >='0') && (**stringPtr <='9')) { @@ -105,8 +105,7 @@ static unsigned readU32FromChar(const char** stringPtr) * If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand. * @return 0 and doesn't modify *stringPtr otherwise. */ -static unsigned longCommandWArg(const char** stringPtr, const char* longCommand) -{ +static unsigned longCommandWArg(const char** stringPtr, const char* longCommand){ size_t const comSize = strlen(longCommand); int const result = !strncmp(*stringPtr, longCommand, comSize); if (result) *stringPtr += comSize; @@ -125,11 +124,9 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand) * *bufferSizePtr is modified, it provides the amount data loaded within buffer. * sampleSizes is filled with the size of each sample. 
*/ -static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, - size_t* sampleSizes, unsigned sstSize, - const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize, - unsigned displayLevel) -{ +static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, size_t* sampleSizes, + unsigned sstSize, const char** fileNamesTable, unsigned nbFiles, + size_t targetChunkSize, unsigned displayLevel) { char* const buff = (char*)buffer; size_t pos = 0; unsigned nbLoadedChunks = 0, fileIndex; @@ -200,8 +197,7 @@ static void shuffle(const char** fileNamesTable, unsigned nbFiles) { /*-******************************************************** * Dictionary training functions **********************************************************/ -static size_t findMaxMem(unsigned long long requiredMem) -{ +static size_t findMaxMem(unsigned long long requiredMem) { size_t const step = 8 MB; void* testmem = NULL; @@ -219,8 +215,7 @@ static size_t findMaxMem(unsigned long long requiredMem) } static void saveDict(const char* dictFileName, - const void* buff, size_t buffSize) -{ + const void* buff, size_t buffSize) { FILE* const f = fopen(dictFileName, "wb"); if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName); @@ -236,8 +231,8 @@ static void saveDict(const char* dictFileName, * provides the amount of data to be loaded and the resulting nb of samples. * This is useful primarily for allocation purpose => sample buffer, and sample sizes table. 
*/ -static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel) -{ +static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, + size_t chunkSize, unsigned displayLevel) { fileStats fs; unsigned n; memset(&fs, 0, sizeof(fs)); @@ -255,8 +250,9 @@ static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, siz return fs; } -int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, unsigned maxDictSize, - ZDICT_random_params_t *params){ +int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, + unsigned maxDictSize, + ZDICT_random_params_t *params) { unsigned const displayLevel = params->zParams.notificationLevel; void* const dictBuffer = malloc(maxDictSize); @@ -285,8 +281,8 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, unsigned m return result; } -sampleInfo* getSampleInfo(const char** fileNamesTable, - unsigned nbFiles, size_t chunkSize, unsigned maxDictSize, const unsigned displayLevel){ +sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, + unsigned maxDictSize, const unsigned displayLevel) { fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel); size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t)); size_t const memMult = RANDOM_MEMMULT; @@ -320,7 +316,8 @@ sampleInfo* getSampleInfo(const char** fileNamesTable, /* Load input buffer */ DISPLAYLEVEL(3, "Shuffling input files\n"); shuffle(fileNamesTable, nbFiles); - nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel); + nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, + fileNamesTable, nbFiles, chunkSize, displayLevel); sampleInfo *info = (sampleInfo *)malloc(sizeof(sampleInfo)); @@ -376,7 +373,8 @@ int main(int argCount, const char* argv[]) unsigned fileNamesNb = filenameIdx; int followLinks = 
0; const char** extendedFileList = NULL; - extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, &fileNamesNb, followLinks); + extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, + &fileNamesNb, followLinks); if (extendedFileList) { unsigned u; for (u=0; u Date: Mon, 16 Jul 2018 18:59:18 -0700 Subject: [PATCH 06/13] Remove CLevel cli option which was accidentally added back in the last commit --- contrib/randomDictBuilder/main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c index e195188b..e66f2847 100644 --- a/contrib/randomDictBuilder/main.c +++ b/contrib/randomDictBuilder/main.c @@ -336,7 +336,6 @@ int main(int argCount, const char* argv[]) const char* programName = argv[0]; int operationResult = 0; - unsigned cLevel = DEFAULT_CLEVEL; char* inputFile = DEFAULT_INPUTFILE; unsigned k = DEFAULT_k; char* outputFile = DEFAULT_OUTPUTFILE; @@ -349,7 +348,6 @@ int main(int argCount, const char* argv[]) for (int i = 1; i < argCount; i++) { const char* argument = argv[i]; if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; } - if (longCommandWArg(&argument, "c=")) { cLevel = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "in=")) { @@ -387,7 +385,7 @@ int main(int argCount, const char* argv[]) ZDICT_random_params_t params; ZDICT_params_t zParams; - zParams.compressionLevel = cLevel; + zParams.compressionLevel = DEFAULT_CLEVEL; zParams.notificationLevel = displayLevel; zParams.dictID = dictID; params.zParams = zParams; From 49acfaeaec44a25c4628a2512965445152e8776a Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Tue, 17 Jul 2018 12:35:09 -0700 Subject: [PATCH 07/13] Move file loading 
functions to new file for access by benchmarking tool --- contrib/randomDictBuilder/Makefile | 11 +- contrib/randomDictBuilder/io.c | 243 +++++++++++++++++++++++++++++ contrib/randomDictBuilder/io.h | 33 ++++ contrib/randomDictBuilder/main.c | 215 +------------------------ 4 files changed, 290 insertions(+), 212 deletions(-) create mode 100644 contrib/randomDictBuilder/io.c create mode 100644 contrib/randomDictBuilder/io.h diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile index 77dd2933..8360a409 100644 --- a/contrib/randomDictBuilder/Makefile +++ b/contrib/randomDictBuilder/Makefile @@ -13,15 +13,18 @@ run: echo "Building a random dictionary with given arguments" ./main $(ARG) -main: main.o random.o libzstd.a - gcc main.o random.o libzstd.a -o main +main: main.o io.o random.o libzstd.a + gcc main.o io.o random.o libzstd.a -o main -main.o: main.c $(PROGRAM_FILES) - gcc -c main.c $(PROGRAM_FILES) -I random.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder +main.o: main.c + gcc -c main.c -I io.h -I random.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder random.o: random.c gcc -c random.c -I random.h -I ../../lib/common -I ../../lib/dictBuilder +io.o: io.c $(PROGRAM_FILES) + gcc -c io.c $(PROGRAM_FILES) -I io.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder + libzstd.a: $(MAKE) -C ../../lib libzstd.a mv ../../lib/libzstd.a . 
diff --git a/contrib/randomDictBuilder/io.c b/contrib/randomDictBuilder/io.c new file mode 100644 index 00000000..a5f71498 --- /dev/null +++ b/contrib/randomDictBuilder/io.c @@ -0,0 +1,243 @@ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* strcmp, strlen */ +#include /* errno */ +#include +#include "io.h" +#include "fileio.h" /* stdinmark, stdoutmark, ZSTD_EXTENSION */ +#include "platform.h" /* Large Files support */ +#include "util.h" +#include "zdict.h" + +/*-************************************* +* Console display +***************************************/ +#define DISPLAY(...) fprintf(stderr, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } + +static const U64 g_refreshRate = SEC_TO_MICRO / 6; +static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; + +#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \ + if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \ + { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ + if (displayLevel>=4) fflush(stderr); } } } + +/*-************************************* +* Exceptions +***************************************/ +#ifndef DEBUG +# define DEBUG 0 +#endif +#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); +#define EXM_THROW(error, ...) \ +{ \ + DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ + DISPLAY("Error %i : ", error); \ + DISPLAY(__VA_ARGS__); \ + DISPLAY("\n"); \ + exit(error); \ +} + + +/*-************************************* +* Constants +***************************************/ + +#define SAMPLESIZE_MAX (128 KB) +#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) +#define RANDOM_MEMMULT 9 +static const size_t g_maxMemory = (sizeof(size_t) == 4) ? 
+ (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); + +#define NOISELENGTH 32 + + + +/* ******************************************************** +* File related operations +**********************************************************/ +/** loadFiles() : + * load samples from files listed in fileNamesTable into buffer. + * works even if buffer is too small to load all samples. + * Also provides the size of each sample into sampleSizes table + * which must be sized correctly, using DiB_fileStats(). + * @return : nb of samples effectively loaded into `buffer` + * *bufferSizePtr is modified, it provides the amount data loaded within buffer. + * sampleSizes is filled with the size of each sample. + */ +static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, size_t* sampleSizes, + unsigned sstSize, const char** fileNamesTable, unsigned nbFiles, + size_t targetChunkSize, unsigned displayLevel) { + char* const buff = (char*)buffer; + size_t pos = 0; + unsigned nbLoadedChunks = 0, fileIndex; + + for (fileIndex=0; fileIndex *bufferSizePtr-pos) break; + { size_t const readSize = fread(buff+pos, 1, toLoad, f); + if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName); + pos += readSize; + sampleSizes[nbLoadedChunks++] = toLoad; + remainingToLoad -= targetChunkSize; + if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */ + fileIndex = nbFiles; /* stop there */ + break; + } + if (toLoad < targetChunkSize) { + fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR); + } } } + fclose(f); + } + DISPLAYLEVEL(2, "\r%79s\r", ""); + *bufferSizePtr = pos; + DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10)) + return nbLoadedChunks; +} + +#define rotl32(x,r) ((x << r) | (x >> (32 - r))) +static U32 getRand(U32* src) +{ + static const U32 prime1 = 2654435761U; + static const U32 prime2 = 2246822519U; + U32 rand32 = *src; + rand32 *= prime1; + rand32 ^= prime2; + rand32 = rotl32(rand32, 13); + *src = rand32; + return rand32 >> 5; +} + +/* 
shuffle() : + * shuffle a table of file names in a semi-random way + * It improves dictionary quality by reducing "locality" impact, so if sample set is very large, + * it will load random elements from it, instead of just the first ones. */ +static void shuffle(const char** fileNamesTable, unsigned nbFiles) { + U32 seed = 0xFD2FB528; + unsigned i; + for (i = nbFiles - 1; i > 0; --i) { + unsigned const j = getRand(&seed) % (i + 1); + const char* const tmp = fileNamesTable[j]; + fileNamesTable[j] = fileNamesTable[i]; + fileNamesTable[i] = tmp; + } +} + + +/*-******************************************************** +* Dictionary training functions +**********************************************************/ +static size_t findMaxMem(unsigned long long requiredMem) { + size_t const step = 8 MB; + void* testmem = NULL; + + requiredMem = (((requiredMem >> 23) + 1) << 23); + requiredMem += step; + if (requiredMem > g_maxMemory) requiredMem = g_maxMemory; + + while (!testmem) { + testmem = malloc((size_t)requiredMem); + requiredMem -= step; + } + + free(testmem); + return (size_t)requiredMem; +} + +void saveDict(const char* dictFileName, + const void* buff, size_t buffSize) { + FILE* const f = fopen(dictFileName, "wb"); + if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName); + + { size_t const n = fwrite(buff, 1, buffSize, f); + if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) } + + { size_t const n = (size_t)fclose(f); + if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) } +} + +/*! getFileStats() : + * Given a list of files, and a chunkSize (0 == no chunk, whole files) + * provides the amount of data to be loaded and the resulting nb of samples. + * This is useful primarily for allocation purpose => sample buffer, and sample sizes table. 
+ */ +static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, + size_t chunkSize, unsigned displayLevel) { + fileStats fs; + unsigned n; + memset(&fs, 0, sizeof(fs)); + for (n=0; n 2*SAMPLESIZE_MAX); + fs.nbSamples += nbSamples; + } + DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10)); + return fs; +} + + + + +sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, + unsigned maxDictSize, const unsigned displayLevel) { + fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel); + size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t)); + size_t const memMult = RANDOM_MEMMULT; + size_t const maxMem = findMaxMem(fs.totalSizeToLoad * memMult) / memMult; + size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad); + void* const srcBuffer = malloc(loadedSize+NOISELENGTH); + + /* Checks */ + if ((!sampleSizes) || (!srcBuffer)) + EXM_THROW(12, "not enough memory for trainFromFiles"); /* should not happen */ + if (fs.oneSampleTooLarge) { + DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n"); + DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n"); + DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX); + } + if (fs.nbSamples < 5) { + DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n"); + DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n"); + DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n"); + EXM_THROW(14, "nb of samples too low"); /* we now clearly forbid this case */ + } + if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) { + DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n"); + DISPLAYLEVEL(2, "! 
Samples should be about 100x larger than target dictionary size \n"); + } + + /* init */ + if (loadedSize < fs.totalSizeToLoad) + DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20)); + + /* Load input buffer */ + DISPLAYLEVEL(3, "Shuffling input files\n"); + shuffle(fileNamesTable, nbFiles); + nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, + fileNamesTable, nbFiles, chunkSize, displayLevel); + + sampleInfo *info = (sampleInfo *)malloc(sizeof(sampleInfo)); + + info->nbSamples = fs.nbSamples; + info->samplesSizes = sampleSizes; + info->srcBuffer = srcBuffer; + + return info; +} diff --git a/contrib/randomDictBuilder/io.h b/contrib/randomDictBuilder/io.h new file mode 100644 index 00000000..4b5639fe --- /dev/null +++ b/contrib/randomDictBuilder/io.h @@ -0,0 +1,33 @@ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* strcmp, strlen */ +#include /* errno */ +#include +#include "zstd_internal.h" /* includes zstd.h */ +#include "fileio.h" /* stdinmark, stdoutmark, ZSTD_EXTENSION */ +#include "platform.h" /* Large Files support */ +#include "util.h" +#include "zdict.h" + + +/*-************************************* +* Structs +***************************************/ +typedef struct { + U64 totalSizeToLoad; + unsigned oneSampleTooLarge; + unsigned nbSamples; +} fileStats; + +typedef struct { + const void* srcBuffer; + const size_t *samplesSizes; + size_t nbSamples; +}sampleInfo; + + +sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, + unsigned maxDictSize, const unsigned displayLevel); + + +void saveDict(const char* dictFileName, const void* buff, size_t buffSize); diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c index e66f2847..34a9d99e 100644 --- a/contrib/randomDictBuilder/main.c +++ b/contrib/randomDictBuilder/main.c @@ -4,11 +4,11 @@ #include /* errno */ #include #include "random.h" -#include "fileio.h" /* 
stdinmark, stdoutmark, ZSTD_EXTENSION */ -#include "platform.h" /* Large Files support */ +#include "io.h" #include "util.h" #include "zdict.h" + /*-************************************* * Console display ***************************************/ @@ -23,6 +23,7 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ if (displayLevel>=4) fflush(stderr); } } } + /*-************************************* * Exceptions ***************************************/ @@ -39,6 +40,7 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; exit(error); \ } + /*-************************************* * Constants ***************************************/ @@ -49,29 +51,6 @@ static const unsigned g_defaultMaxDictSize = 110 KB; #define DEFAULT_OUTPUTFILE "defaultDict" #define DEFAULT_DICTID 0 -#define SAMPLESIZE_MAX (128 KB) -#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) -#define RANDOM_MEMMULT 9 -static const size_t g_maxMemory = (sizeof(size_t) == 4) ? - (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); - -#define NOISELENGTH 32 - - -/*-************************************* -* Structs -***************************************/ -typedef struct { - U64 totalSizeToLoad; - unsigned oneSampleTooLarge; - unsigned nbSamples; -} fileStats; - -typedef struct { - const void* srcBuffer; - const size_t *samplesSizes; - size_t nbSamples; -}sampleInfo; /*-************************************* @@ -112,144 +91,11 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand) return result; } -/* ******************************************************** -* File related operations -**********************************************************/ -/** loadFiles() : - * load samples from files listed in fileNamesTable into buffer. - * works even if buffer is too small to load all samples. 
- * Also provides the size of each sample into sampleSizes table - * which must be sized correctly, using DiB_fileStats(). - * @return : nb of samples effectively loaded into `buffer` - * *bufferSizePtr is modified, it provides the amount data loaded within buffer. - * sampleSizes is filled with the size of each sample. - */ -static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, size_t* sampleSizes, - unsigned sstSize, const char** fileNamesTable, unsigned nbFiles, - size_t targetChunkSize, unsigned displayLevel) { - char* const buff = (char*)buffer; - size_t pos = 0; - unsigned nbLoadedChunks = 0, fileIndex; - - for (fileIndex=0; fileIndex *bufferSizePtr-pos) break; - { size_t const readSize = fread(buff+pos, 1, toLoad, f); - if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName); - pos += readSize; - sampleSizes[nbLoadedChunks++] = toLoad; - remainingToLoad -= targetChunkSize; - if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */ - fileIndex = nbFiles; /* stop there */ - break; - } - if (toLoad < targetChunkSize) { - fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR); - } } } - fclose(f); - } - DISPLAYLEVEL(2, "\r%79s\r", ""); - *bufferSizePtr = pos; - DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10)) - return nbLoadedChunks; -} - -#define rotl32(x,r) ((x << r) | (x >> (32 - r))) -static U32 getRand(U32* src) -{ - static const U32 prime1 = 2654435761U; - static const U32 prime2 = 2246822519U; - U32 rand32 = *src; - rand32 *= prime1; - rand32 ^= prime2; - rand32 = rotl32(rand32, 13); - *src = rand32; - return rand32 >> 5; -} - -/* shuffle() : - * shuffle a table of file names in a semi-random way - * It improves dictionary quality by reducing "locality" impact, so if sample set is very large, - * it will load random elements from it, instead of just the first ones. 
*/ -static void shuffle(const char** fileNamesTable, unsigned nbFiles) { - U32 seed = 0xFD2FB528; - unsigned i; - for (i = nbFiles - 1; i > 0; --i) { - unsigned const j = getRand(&seed) % (i + 1); - const char* const tmp = fileNamesTable[j]; - fileNamesTable[j] = fileNamesTable[i]; - fileNamesTable[i] = tmp; - } -} -/*-******************************************************** -* Dictionary training functions -**********************************************************/ -static size_t findMaxMem(unsigned long long requiredMem) { - size_t const step = 8 MB; - void* testmem = NULL; - - requiredMem = (((requiredMem >> 23) + 1) << 23); - requiredMem += step; - if (requiredMem > g_maxMemory) requiredMem = g_maxMemory; - - while (!testmem) { - testmem = malloc((size_t)requiredMem); - requiredMem -= step; - } - - free(testmem); - return (size_t)requiredMem; -} - -static void saveDict(const char* dictFileName, - const void* buff, size_t buffSize) { - FILE* const f = fopen(dictFileName, "wb"); - if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName); - - { size_t const n = fwrite(buff, 1, buffSize, f); - if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) } - - { size_t const n = (size_t)fclose(f); - if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) } -} - -/*! getFileStats() : - * Given a list of files, and a chunkSize (0 == no chunk, whole files) - * provides the amount of data to be loaded and the resulting nb of samples. - * This is useful primarily for allocation purpose => sample buffer, and sample sizes table. 
- */ -static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, - size_t chunkSize, unsigned displayLevel) { - fileStats fs; - unsigned n; - memset(&fs, 0, sizeof(fs)); - for (n=0; n 2*SAMPLESIZE_MAX); - fs.nbSamples += nbSamples; - } - DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10)); - return fs; -} - +/*-************************************* +* RANDOM +***************************************/ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, unsigned maxDictSize, ZDICT_random_params_t *params) { @@ -281,53 +127,6 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, return result; } -sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, - unsigned maxDictSize, const unsigned displayLevel) { - fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel); - size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t)); - size_t const memMult = RANDOM_MEMMULT; - size_t const maxMem = findMaxMem(fs.totalSizeToLoad * memMult) / memMult; - size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad); - void* const srcBuffer = malloc(loadedSize+NOISELENGTH); - - /* Checks */ - if ((!sampleSizes) || (!srcBuffer)) - EXM_THROW(12, "not enough memory for trainFromFiles"); /* should not happen */ - if (fs.oneSampleTooLarge) { - DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n"); - DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n"); - DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX); - } - if (fs.nbSamples < 5) { - DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n"); - DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n"); - DISPLAYLEVEL(2, "! 
Alternatively, split files into fixed-size blocks representative of samples, with -B# \n"); - EXM_THROW(14, "nb of samples too low"); /* we now clearly forbid this case */ - } - if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) { - DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n"); - DISPLAYLEVEL(2, "! Samples should be about 100x larger than target dictionary size \n"); - } - - /* init */ - if (loadedSize < fs.totalSizeToLoad) - DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20)); - - /* Load input buffer */ - DISPLAYLEVEL(3, "Shuffling input files\n"); - shuffle(fileNamesTable, nbFiles); - nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, - fileNamesTable, nbFiles, chunkSize, displayLevel); - - sampleInfo *info = (sampleInfo *)malloc(sizeof(sampleInfo)); - - info->nbSamples = fs.nbSamples; - info->samplesSizes = sampleSizes; - info->srcBuffer = srcBuffer; - - return info; -} - int main(int argCount, const char* argv[]) From e6fe4058388c820444a80d9d10aa5d840fab3c0c Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Tue, 17 Jul 2018 12:42:53 -0700 Subject: [PATCH 08/13] Make test PHONY target --- contrib/randomDictBuilder/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile index 8360a409..678ff28a 100644 --- a/contrib/randomDictBuilder/Makefile +++ b/contrib/randomDictBuilder/Makefile @@ -6,6 +6,7 @@ ARG := all: main run clean +.PHONY: test test: main testrun testshell clean .PHONY: run From 896ff0644a2531a22edf78ea9cb6b58a4de9c77f Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Tue, 17 Jul 2018 16:01:44 -0700 Subject: [PATCH 09/13] Fix deallocation problem and add documentation --- contrib/randomDictBuilder/io.c | 7 +++++++ contrib/randomDictBuilder/io.h | 17 +++++++++++++++++ contrib/randomDictBuilder/main.c | 20 +++++++++++--------- 
contrib/randomDictBuilder/random.c | 11 ++--------- contrib/randomDictBuilder/random.h | 9 ++++++++- 5 files changed, 45 insertions(+), 19 deletions(-) diff --git a/contrib/randomDictBuilder/io.c b/contrib/randomDictBuilder/io.c index a5f71498..1c3eda58 100644 --- a/contrib/randomDictBuilder/io.c +++ b/contrib/randomDictBuilder/io.c @@ -241,3 +241,10 @@ sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t return info; } + + +void freeSampleInfo(sampleInfo *info) { + if (info->samplesSizes) free((void*)(info->samplesSizes)); + if (info->srcBuffer) free((void*)(info->srcBuffer)); + free(info); +} diff --git a/contrib/randomDictBuilder/io.h b/contrib/randomDictBuilder/io.h index 4b5639fe..55967f76 100644 --- a/contrib/randomDictBuilder/io.h +++ b/contrib/randomDictBuilder/io.h @@ -26,8 +26,25 @@ typedef struct { }sampleInfo; + +/*! getSampleInfo(): + * Load from input files and add samples to buffer + * @return: a sampleInfo struct containing information about buffer where samples are stored, + * size of each sample, and total number of samples + */ sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned maxDictSize, const unsigned displayLevel); + +/*! freeSampleInfo(): + * Free memory allocated for info + */ +void freeSampleInfo(sampleInfo *info); + + + +/*!
saveDict(): + * Save data stored on buff to dictFileName + */ void saveDict(const char* dictFileName, const void* buff, size_t buffSize); diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c index 34a9d99e..1f12c7a4 100644 --- a/contrib/randomDictBuilder/main.c +++ b/contrib/randomDictBuilder/main.c @@ -46,7 +46,6 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; ***************************************/ static const unsigned g_defaultMaxDictSize = 110 KB; #define DEFAULT_CLEVEL 3 -#define DEFAULT_INPUTFILE "" #define DEFAULT_k 200 #define DEFAULT_OUTPUTFILE "defaultDict" #define DEFAULT_DICTID 0 @@ -135,30 +134,29 @@ int main(int argCount, const char* argv[]) const char* programName = argv[0]; int operationResult = 0; - char* inputFile = DEFAULT_INPUTFILE; + /* Initialize arguments to default values */ unsigned k = DEFAULT_k; - char* outputFile = DEFAULT_OUTPUTFILE; + const char* outputFile = DEFAULT_OUTPUTFILE; unsigned dictID = DEFAULT_DICTID; unsigned maxDictSize = g_defaultMaxDictSize; + /* Initialize table to store input files */ const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); unsigned filenameIdx = 0; + /* Parse arguments */ for (int i = 1; i < argCount; i++) { const char* argument = argv[i]; if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "in=")) { - inputFile = malloc(strlen(argument) + 1); - strcpy(inputFile, argument); - filenameTable[filenameIdx] = inputFile; + filenameTable[filenameIdx] = argument; filenameIdx++; continue; } if (longCommandWArg(&argument, "out=")) { - outputFile = malloc(strlen(argument) + 1); - strcpy(outputFile, argument); + outputFile = argument; continue; } DISPLAYLEVEL(1, "Incorrect parameters\n"); @@ 
-168,7 +166,7 @@ int main(int argCount, const char* argv[]) char* fileNamesBuf = NULL; unsigned fileNamesNb = filenameIdx; - int followLinks = 0; + int followLinks = 0; /* follow directory recursively */ const char** extendedFileList = NULL; extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, &fileNamesNb, followLinks); @@ -194,5 +192,9 @@ int main(int argCount, const char* argv[]) filenameIdx, blockSize, maxDictSize, zParams.notificationLevel); operationResult = RANDOM_trainFromFiles(outputFile, info, maxDictSize, &params); + /* Free allocated memory */ + UTIL_freeFileList(extendedFileList, fileNamesBuf); + freeSampleInfo(info); + return operationResult; } diff --git a/contrib/randomDictBuilder/random.c b/contrib/randomDictBuilder/random.c index cfed14a4..34aec39e 100644 --- a/contrib/randomDictBuilder/random.c +++ b/contrib/randomDictBuilder/random.c @@ -113,15 +113,8 @@ static size_t RANDOM_buildDictionary(const size_t totalSamplesSize, const BYTE * } -/*! ZDICT_trainFromBuffer_random(): - * Train a dictionary from an array of samples using the RANDOM algorithm. - * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, - * supplied with an array of sizes `samplesSizes`, providing the size of each - * sample, in order. - * The resulting dictionary will be saved into `dictBuffer`. - * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) - * or an error code, which can be tested with ZDICT_isError(). - */ + + ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, diff --git a/contrib/randomDictBuilder/random.h b/contrib/randomDictBuilder/random.h index b6696323..c3146f86 100644 --- a/contrib/randomDictBuilder/random.h +++ b/contrib/randomDictBuilder/random.h @@ -23,7 +23,14 @@ typedef struct { } ZDICT_random_params_t; - +/*!
ZDICT_trainFromBuffer_random(): + * Train a dictionary from an array of samples. + * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, + * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. + * The resulting dictionary will be saved into `dictBuffer`. + * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) + * or an error code, which can be tested with ZDICT_isError(). + */ ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, ZDICT_random_params_t parameters); From ce09fb723d1311e62c920430fb14634e9b67dd70 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Tue, 17 Jul 2018 16:13:40 -0700 Subject: [PATCH 10/13] Update freeSampleInfo --- contrib/randomDictBuilder/io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/randomDictBuilder/io.c b/contrib/randomDictBuilder/io.c index 1c3eda58..67c40858 100644 --- a/contrib/randomDictBuilder/io.c +++ b/contrib/randomDictBuilder/io.c @@ -244,6 +244,7 @@ sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t void freeSampleInfo(sampleInfo *info) { + if (!info) return; if (info->samplesSizes) free((void*)(info->samplesSizes)); if (info->srcBuffer) free((void*)(info->srcBuffer)); free(info); From 52e7cf0e405ac6eb827322b607d094125646bbfb Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Wed, 18 Jul 2018 10:40:13 -0700 Subject: [PATCH 11/13] Add cleanup to trainfromFiles and move RANDOM_segment_t declaration --- contrib/randomDictBuilder/main.c | 3 ++- contrib/randomDictBuilder/random.c | 9 +++++++++ contrib/randomDictBuilder/random.h | 7 ------- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c index 1f12c7a4..36c4326b 100644 --- a/contrib/randomDictBuilder/main.c +++ b/contrib/randomDictBuilder/main.c @@ 
-114,7 +114,7 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, if (ZDICT_isError(dictSize)) { DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ result = 1; - free(dictBuffer); + goto _cleanup; } /* save dict */ DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); @@ -122,6 +122,7 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, } /* clean up */ +_cleanup: free(dictBuffer); return result; } diff --git a/contrib/randomDictBuilder/random.c b/contrib/randomDictBuilder/random.c index 34aec39e..5276bea9 100644 --- a/contrib/randomDictBuilder/random.c +++ b/contrib/randomDictBuilder/random.c @@ -47,6 +47,15 @@ static size_t RANDOM_sum(const size_t *samplesSizes, unsigned nbSamples) { } +/** + * A segment is an inclusive range in the source. + */ +typedef struct { + U32 begin; + U32 end; +} RANDOM_segment_t; + + /** * Selects a random segment from totalSamplesSize - k + 1 possible segments */ diff --git a/contrib/randomDictBuilder/random.h b/contrib/randomDictBuilder/random.h index c3146f86..352775f9 100644 --- a/contrib/randomDictBuilder/random.h +++ b/contrib/randomDictBuilder/random.h @@ -8,13 +8,6 @@ #endif #include "zdict.h" -/** - * A segment is an inclusive range in the source. 
- */ -typedef struct { - U32 begin; - U32 end; -} RANDOM_segment_t; typedef struct { From 5bb46a898e6565e5bc1ee861999384f806f83831 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Wed, 18 Jul 2018 12:15:49 -0700 Subject: [PATCH 12/13] Rename cleanup --- contrib/randomDictBuilder/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c index 36c4326b..4751a9e1 100644 --- a/contrib/randomDictBuilder/main.c +++ b/contrib/randomDictBuilder/main.c @@ -114,7 +114,7 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, if (ZDICT_isError(dictSize)) { DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ result = 1; - goto _cleanup; + goto _done; } /* save dict */ DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); @@ -122,7 +122,7 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, } /* clean up */ -_cleanup: +_done: free(dictBuffer); return result; } From 0c5eaef248443342dd1cd19f5e434334bef6fc4c Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Thu, 19 Jul 2018 13:44:27 -0700 Subject: [PATCH 13/13] Update Makefile --- contrib/randomDictBuilder/Makefile | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile index 678ff28a..5f9240bf 100644 --- a/contrib/randomDictBuilder/Makefile +++ b/contrib/randomDictBuilder/Makefile @@ -1,8 +1,11 @@ -PROGRAM_FILES := ../../programs/fileio.c +ARG := + +CC ?= gcc +CFLAGS ?= -O3 +INCLUDES := -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder TEST_INPUT := ../../lib TEST_OUTPUT := randomDict -ARG := all: main run clean @@ -15,16 +18,16 @@ run: ./main $(ARG) main: main.o io.o random.o libzstd.a - gcc main.o io.o random.o libzstd.a -o main + $(CC) $(CFLAGS) main.o io.o random.o libzstd.a -o 
main main.o: main.c - gcc -c main.c -I io.h -I random.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder + $(CC) $(CFLAGS) $(INCLUDES) -c main.c random.o: random.c - gcc -c random.c -I random.h -I ../../lib/common -I ../../lib/dictBuilder + $(CC) $(CFLAGS) $(INCLUDES) -c random.c -io.o: io.c $(PROGRAM_FILES) - gcc -c io.c $(PROGRAM_FILES) -I io.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder +io.o: io.c + $(CC) $(CFLAGS) $(INCLUDES) -c io.c libzstd.a: $(MAKE) -C ../../lib libzstd.a @@ -44,8 +47,6 @@ testshell: test.sh .PHONY: clean clean: - rm -f libzstd.a main - rm -f ../../lib/*/*.o - rm -f ../../programs/*.o - rm -f *.o + rm -f *.o main libzstd.a + $(MAKE) -C ../../lib clean echo "Cleaning is completed"