From b5806d33db813dfb2bac7cd3b97b5bcf09ee57b7 Mon Sep 17 00:00:00 2001 From: Jennifer Liu Date: Mon, 16 Jul 2018 16:03:04 -0700 Subject: [PATCH] Refactor RANDOM --- contrib/randomDictBuilder/Makefile | 12 +- contrib/randomDictBuilder/main.c | 297 ++++++++++++++++++++++++- contrib/randomDictBuilder/random.c | 343 ++--------------------------- contrib/randomDictBuilder/random.h | 23 -- 4 files changed, 314 insertions(+), 361 deletions(-) diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile index a2aade23..443f6f04 100644 --- a/contrib/randomDictBuilder/Makefile +++ b/contrib/randomDictBuilder/Makefile @@ -14,14 +14,14 @@ rand: ./main $(ARG) -main: random.o main.o libzstd.a - gcc random.o main.o libzstd.a -o main +main: main.o random.o libzstd.a + gcc main.o random.o libzstd.a -o main -main.o: main.c - gcc -c main.c -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder -I random.h +main.o: main.c $(PROGRAM_FILES) + gcc -c main.c $(PROGRAM_FILES) -I random.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder -random.o: $(PROGRAM_FILES) random.c - gcc -c $(PROGRAM_FILES) -I ../../programs -I ../../lib/common -I random.h random.c +random.o: random.c + gcc -c random.c -I random.h -I ../../lib/common -I ../../lib/dictBuilder libzstd.a: $(MAKE) -C ../../lib libzstd.a diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c index cf0b9476..d9295aa9 100644 --- a/contrib/randomDictBuilder/main.c +++ b/contrib/randomDictBuilder/main.c @@ -3,13 +3,45 @@ #include /* strcmp, strlen */ #include /* errno */ #include -#include "fileio.h" /* stdinmark, stdoutmark, ZSTD_EXTENSION */ #include "random.h" +#include "fileio.h" /* stdinmark, stdoutmark, ZSTD_EXTENSION */ +#include "platform.h" /* Large Files support */ #include "util.h" +#include "zdict.h" +/*-************************************* +* Console display +***************************************/ #define DISPLAY(...) fprintf(stderr, __VA_ARGS__) #define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } +static const U64 g_refreshRate = SEC_TO_MICRO / 6; +static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; + +#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \ + if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \ + { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ + if (displayLevel>=4) fflush(stderr); } } } + +/*-************************************* +* Exceptions +***************************************/ +#ifndef DEBUG +# define DEBUG 0 +#endif +#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); +#define EXM_THROW(error, ...) \ +{ \ + DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ + DISPLAY("Error %i : ", error); \ + DISPLAY(__VA_ARGS__); \ + DISPLAY("\n"); \ + exit(error); \ +} + +/*-************************************* +* Constants +***************************************/ static const unsigned g_defaultMaxDictSize = 110 KB; #define DEFAULT_CLEVEL 3 #define DEFAULT_INPUTFILE "" @@ -17,7 +49,33 @@ static const unsigned g_defaultMaxDictSize = 110 KB; #define DEFAULT_OUTPUTFILE "defaultDict" #define DEFAULT_DICTID 0 +#define SAMPLESIZE_MAX (128 KB) +#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) +#define RANDOM_MEMMULT 9 +static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); +#define NOISELENGTH 32 + + +/*-************************************* +* Structs +***************************************/ +typedef struct { + U64 totalSizeToLoad; + unsigned oneSampleTooLarge; + unsigned nbSamples; +} fileStats; + +typedef struct { + const void* srcBuffer; + const size_t *samplesSizes; + size_t nbSamples; +}sampleInfo; + + +/*-************************************* +* Commandline related functions +***************************************/ static unsigned readU32FromChar(const char** stringPtr) { const char errorMsg[] = "error: numeric value too large"; @@ -42,7 +100,6 @@ static unsigned readU32FromChar(const char** stringPtr) return result; } - /** longCommandWArg() : * check if *stringPtr is the same as longCommand. * If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand. @@ -56,6 +113,225 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand) return result; } +/* ******************************************************** +* File related operations +**********************************************************/ +/** loadFiles() : + * load samples from files listed in fileNamesTable into buffer. + * works even if buffer is too small to load all samples. + * Also provides the size of each sample into sampleSizes table + * which must be sized correctly, using DiB_fileStats(). + * @return : nb of samples effectively loaded into `buffer` + * *bufferSizePtr is modified, it provides the amount data loaded within buffer. + * sampleSizes is filled with the size of each sample. + */ +static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, + size_t* sampleSizes, unsigned sstSize, + const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize, + unsigned displayLevel) +{ + char* const buff = (char*)buffer; + size_t pos = 0; + unsigned nbLoadedChunks = 0, fileIndex; + + for (fileIndex=0; fileIndex *bufferSizePtr-pos) break; + { size_t const readSize = fread(buff+pos, 1, toLoad, f); + if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName); + pos += readSize; + sampleSizes[nbLoadedChunks++] = toLoad; + remainingToLoad -= targetChunkSize; + if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */ + fileIndex = nbFiles; /* stop there */ + break; + } + if (toLoad < targetChunkSize) { + fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR); + } } } + fclose(f); + } + DISPLAYLEVEL(2, "\r%79s\r", ""); + *bufferSizePtr = pos; + DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10)) + return nbLoadedChunks; +} + +#define rotl32(x,r) ((x << r) | (x >> (32 - r))) +static U32 getRand(U32* src) +{ + static const U32 prime1 = 2654435761U; + static const U32 prime2 = 2246822519U; + U32 rand32 = *src; + rand32 *= prime1; + rand32 ^= prime2; + rand32 = rotl32(rand32, 13); + *src = rand32; + return rand32 >> 5; +} + +/* shuffle() : + * shuffle a table of file names in a semi-random way + * It improves dictionary quality by reducing "locality" impact, so if sample set is very large, + * it will load random elements from it, instead of just the first ones. */ +static void shuffle(const char** fileNamesTable, unsigned nbFiles) { + U32 seed = 0xFD2FB528; + unsigned i; + for (i = nbFiles - 1; i > 0; --i) { + unsigned const j = getRand(&seed) % (i + 1); + const char* const tmp = fileNamesTable[j]; + fileNamesTable[j] = fileNamesTable[i]; + fileNamesTable[i] = tmp; + } +} + + +/*-******************************************************** +* Dictionary training functions +**********************************************************/ +static size_t findMaxMem(unsigned long long requiredMem) +{ + size_t const step = 8 MB; + void* testmem = NULL; + + requiredMem = (((requiredMem >> 23) + 1) << 23); + requiredMem += step; + if (requiredMem > g_maxMemory) requiredMem = g_maxMemory; + + while (!testmem) { + testmem = malloc((size_t)requiredMem); + requiredMem -= step; + } + + free(testmem); + return (size_t)requiredMem; +} + +static void saveDict(const char* dictFileName, + const void* buff, size_t buffSize) +{ + FILE* const f = fopen(dictFileName, "wb"); + if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName); + + { size_t const n = fwrite(buff, 1, buffSize, f); + if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) } + + { size_t const n = (size_t)fclose(f); + if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) } +} + +/*! getFileStats() : + * Given a list of files, and a chunkSize (0 == no chunk, whole files) + * provides the amount of data to be loaded and the resulting nb of samples. + * This is useful primarily for allocation purpose => sample buffer, and sample sizes table. + */ +static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel) +{ + fileStats fs; + unsigned n; + memset(&fs, 0, sizeof(fs)); + for (n=0; n 2*SAMPLESIZE_MAX); + fs.nbSamples += nbSamples; + } + DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10)); + return fs; +} + +int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, unsigned maxDictSize, + ZDICT_random_params_t *params){ + unsigned const displayLevel = params->zParams.notificationLevel; + void* const dictBuffer = malloc(maxDictSize); + + int result = 0; + + /* Checks */ + if (!dictBuffer) + EXM_THROW(12, "not enough memory for trainFromFiles"); /* should not happen */ + + { size_t dictSize; + dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, *params); + DISPLAYLEVEL(2, "k=%u\n", params->k); + if (ZDICT_isError(dictSize)) { + DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ + result = 1; + free(dictBuffer); + } + /* save dict */ + DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); + saveDict(dictFileName, dictBuffer, dictSize); + } + + /* clean up */ + free(dictBuffer); + return result; +} + +sampleInfo* getSampleInfo(const char** fileNamesTable, + unsigned nbFiles, size_t chunkSize, unsigned maxDictSize, const unsigned displayLevel){ + fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel); + size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t)); + size_t const memMult = RANDOM_MEMMULT; + size_t const maxMem = findMaxMem(fs.totalSizeToLoad * memMult) / memMult; + size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad); + void* const srcBuffer = malloc(loadedSize+NOISELENGTH); + + /* Checks */ + if ((!sampleSizes) || (!srcBuffer)) + EXM_THROW(12, "not enough memory for trainFromFiles"); /* should not happen */ + if (fs.oneSampleTooLarge) { + DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n"); + DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n"); + DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX); + } + if (fs.nbSamples < 5) { + DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n"); + DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n"); + DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n"); + EXM_THROW(14, "nb of samples too low"); /* we now clearly forbid this case */ + } + if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) { + DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n"); + DISPLAYLEVEL(2, "! Samples should be about 100x larger than target dictionary size \n"); + } + + /* init */ + if (loadedSize < fs.totalSizeToLoad) + DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20)); + + /* Load input buffer */ + DISPLAYLEVEL(3, "Shuffling input files\n"); + shuffle(fileNamesTable, nbFiles); + nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel); + + sampleInfo *info = (sampleInfo *)malloc(sizeof(sampleInfo)); + + info->nbSamples = fs.nbSamples; + info->samplesSizes = sampleSizes; + info->srcBuffer = srcBuffer; + + return info; +} + + int main(int argCount, const char* argv[]) { @@ -63,7 +339,7 @@ int main(int argCount, const char* argv[]) const char* programName = argv[0]; int operationResult = 0; - /* Initialize parameters with default value */ + unsigned cLevel = DEFAULT_CLEVEL; char* inputFile = DEFAULT_INPUTFILE; unsigned k = DEFAULT_k; char* outputFile = DEFAULT_OUTPUTFILE; @@ -76,10 +352,10 @@ int main(int argCount, const char* argv[]) for (int i = 1; i < argCount; i++) { const char* argument = argv[i]; if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "c=")) { cLevel = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "in=")) { - /* Allow multiple input files */ inputFile = malloc(strlen(argument) + 1); strcpy(inputFile, argument); filenameTable[filenameIdx] = inputFile; @@ -96,12 +372,6 @@ int main(int argCount, const char* argv[]) return operationResult; } - if (maxDictSize == 0) { - DISPLAYLEVEL(1, "maxDictSize should not be 0.\n"); - operationResult = 1; - return operationResult; - } - char* fileNamesBuf = NULL; unsigned fileNamesNb = filenameIdx; int followLinks = 0; @@ -119,12 +389,15 @@ int main(int argCount, const char* argv[]) ZDICT_random_params_t params; ZDICT_params_t zParams; - zParams.compressionLevel = DEFAULT_CLEVEL; + zParams.compressionLevel = cLevel; zParams.notificationLevel = displayLevel; zParams.dictID = dictID; params.zParams = zParams; params.k = k; - operationResult = RANDOM_trainFromFiles(outputFile, maxDictSize, filenameTable, filenameIdx, blockSize, ¶ms); + sampleInfo* info= getSampleInfo(filenameTable, + filenameIdx, blockSize, maxDictSize, zParams.notificationLevel); + operationResult = RANDOM_trainFromFiles(outputFile, info, maxDictSize, ¶ms); + return operationResult; } diff --git a/contrib/randomDictBuilder/random.c b/contrib/randomDictBuilder/random.c index a59427ba..96c02389 100644 --- a/contrib/randomDictBuilder/random.c +++ b/contrib/randomDictBuilder/random.c @@ -5,24 +5,12 @@ #include /* malloc, free, qsort */ #include /* memset */ #include /* clock */ -#include "zstd_internal.h" /* includes zstd.h */ +#include "random.h" +#include "util.h" /* UTIL_getFileSize, UTIL_getTotalFileSize */ #ifndef ZDICT_STATIC_LINKING_ONLY #define ZDICT_STATIC_LINKING_ONLY #endif -#include "random.h" -#include "platform.h" /* Large Files support */ -#include "util.h" /* UTIL_getFileSize, UTIL_getTotalFileSize */ - -/*-************************************* -* Constants -***************************************/ -#define SAMPLESIZE_MAX (128 KB) -#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) -#define RANDOM_MEMMULT 9 -static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); - -#define NOISELENGTH 32 -#define DEFAULT_K 200 +#include "zdict.h" /*-************************************* * Console display @@ -30,179 +18,16 @@ static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((siz #define DISPLAY(...) fprintf(stderr, __VA_ARGS__) #define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } -static const U64 g_refreshRate = SEC_TO_MICRO / 6; -static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; - -#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \ - if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \ - { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ - if (displayLevel>=4) fflush(stderr); } } } - - -/*-************************************* -* Exceptions -***************************************/ -#ifndef DEBUG -# define DEBUG 0 -#endif -#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); -#define EXM_THROW(error, ...) \ -{ \ - DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ - DISPLAY("Error %i : ", error); \ - DISPLAY(__VA_ARGS__); \ - DISPLAY("\n"); \ - exit(error); \ -} - - -/* ******************************************************** -* File related operations -**********************************************************/ -/** loadFiles() : - * load samples from files listed in fileNamesTable into buffer. - * works even if buffer is too small to load all samples. - * Also provides the size of each sample into sampleSizes table - * which must be sized correctly, using DiB_fileStats(). - * @return : nb of samples effectively loaded into `buffer` - * *bufferSizePtr is modified, it provides the amount data loaded within buffer. - * sampleSizes is filled with the size of each sample. - */ -static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, - size_t* sampleSizes, unsigned sstSize, - const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize, - unsigned displayLevel) -{ - char* const buff = (char*)buffer; - size_t pos = 0; - unsigned nbLoadedChunks = 0, fileIndex; - - for (fileIndex=0; fileIndex *bufferSizePtr-pos) break; - { size_t const readSize = fread(buff+pos, 1, toLoad, f); - if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName); - pos += readSize; - sampleSizes[nbLoadedChunks++] = toLoad; - remainingToLoad -= targetChunkSize; - if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */ - fileIndex = nbFiles; /* stop there */ - break; - } - if (toLoad < targetChunkSize) { - fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR); - } } } - fclose(f); - } - DISPLAYLEVEL(2, "\r%79s\r", ""); - *bufferSizePtr = pos; - DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10)) - return nbLoadedChunks; -} - - - -#define rotl32(x,r) ((x << r) | (x >> (32 - r))) -static U32 getRand(U32* src) -{ - static const U32 prime1 = 2654435761U; - static const U32 prime2 = 2246822519U; - U32 rand32 = *src; - rand32 *= prime1; - rand32 ^= prime2; - rand32 = rotl32(rand32, 13); - *src = rand32; - return rand32 >> 5; -} - - -/* shuffle() : - * shuffle a table of file names in a semi-random way - * It improves dictionary quality by reducing "locality" impact, so if sample set is very large, - * it will load random elements from it, instead of just the first ones. */ -static void shuffle(const char** fileNamesTable, unsigned nbFiles) { - U32 seed = 0xFD2FB528; - unsigned i; - for (i = nbFiles - 1; i > 0; --i) { - unsigned const j = getRand(&seed) % (i + 1); - const char* const tmp = fileNamesTable[j]; - fileNamesTable[j] = fileNamesTable[i]; - fileNamesTable[i] = tmp; - } -} - - - -/*-******************************************************** -* Dictionary training functions -**********************************************************/ -static size_t findMaxMem(unsigned long long requiredMem) -{ - size_t const step = 8 MB; - void* testmem = NULL; - - requiredMem = (((requiredMem >> 23) + 1) << 23); - requiredMem += step; - if (requiredMem > g_maxMemory) requiredMem = g_maxMemory; - - while (!testmem) { - testmem = malloc((size_t)requiredMem); - requiredMem -= step; - } - - free(testmem); - return (size_t)requiredMem; -} - -static void saveDict(const char* dictFileName, - const void* buff, size_t buffSize) -{ - FILE* const f = fopen(dictFileName, "wb"); - if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName); - - { size_t const n = fwrite(buff, 1, buffSize, f); - if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) } - - { size_t const n = (size_t)fclose(f); - if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) } -} - -/*! getFileStats() : - * Given a list of files, and a chunkSize (0 == no chunk, whole files) - * provides the amount of data to be loaded and the resulting nb of samples. - * This is useful primarily for allocation purpose => sample buffer, and sample sizes table. - */ -static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel) -{ - fileStats fs; - unsigned n; - memset(&fs, 0, sizeof(fs)); - for (n=0; n 2*SAMPLESIZE_MAX); - fs.nbSamples += nbSamples; - } - DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10)); - return fs; -} - - +#define LOCALDISPLAYUPDATE(displayLevel, l, ...) \ + if (displayLevel >= l) { \ + if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) { \ + g_time = clock(); \ + DISPLAY(__VA_ARGS__); \ + } \ + } +#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(displayLevel, l, __VA_ARGS__) +static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100; +static clock_t g_time = 0; @@ -225,16 +50,14 @@ static size_t RANDOM_sum(const size_t *samplesSizes, unsigned nbSamples) { /** * Selects a random segment from totalSamplesSize - k + 1 possible segments */ -static RANDOM_segment_t RANDOM_selectSegment(const RANDOM_ctx_t *ctx, +static RANDOM_segment_t RANDOM_selectSegment(const size_t totalSamplesSize, ZDICT_random_params_t parameters) { const U32 k = parameters.k; RANDOM_segment_t segment; unsigned index; - /* Seed random number generator */ - srand((unsigned)time(NULL)); /* Randomly generate a number from 0 to sampleSizes - k */ - index = rand()%(ctx->totalSamplesSize - k + 1); + index = rand()%(totalSamplesSize - k + 1); /* inclusive */ segment.begin = index; @@ -261,65 +84,11 @@ static int RANDOM_checkParameters(ZDICT_random_params_t parameters, size_t maxDi } -/** - * Clean up a context initialized with `RANDOM_ctx_init()`. - */ -static void RANDOM_ctx_destroy(RANDOM_ctx_t *ctx) { - if (!ctx) { - return; - } - if (ctx->offsets) { - free(ctx->offsets); - ctx->offsets = NULL; - } -} - - -/** - * Prepare a context for dictionary building. - * Returns 1 on success or zero on error. - * The context must be destroyed with `RANDOM_ctx_destroy()`. - */ -static int RANDOM_ctx_init(RANDOM_ctx_t *ctx, const void *samplesBuffer, - const size_t *samplesSizes, unsigned nbSamples) { - const BYTE *const samples = (const BYTE *)samplesBuffer; - const size_t totalSamplesSize = RANDOM_sum(samplesSizes, nbSamples); - const int displayLevel = 2; - /* Checks */ - if (totalSamplesSize >= (size_t)RANDOM_MAX_SAMPLES_SIZE) { - DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n", - (U32)(totalSamplesSize>>20), (RANDOM_MAX_SAMPLES_SIZE >> 20)); - return 0; - } - memset(ctx, 0, sizeof(*ctx)); - DISPLAYLEVEL(1, "Building dictionary from %u samples of total size %u\n", nbSamples, - (U32)totalSamplesSize); - ctx->samples = samples; - ctx->samplesSizes = samplesSizes; - ctx->nbSamples = nbSamples; - ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t)); - ctx->totalSamplesSize = (U32)totalSamplesSize; - if (!ctx->offsets) { - DISPLAYLEVEL(1, "Failed to allocate buffer for offsets\n"); - RANDOM_ctx_destroy(ctx); - return 0; - } - { - U32 i; - ctx->offsets[0] = 0; - for (i = 1; i <= nbSamples; ++i) { - ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1]; - } - } - return 1; -} - - /** * Given the prepared context build the dictionary. */ -static size_t RANDOM_buildDictionary(const RANDOM_ctx_t *ctx, void *dictBuffer, - size_t dictBufferCapacity, +static size_t RANDOM_buildDictionary(const size_t totalSamplesSize, const BYTE *samples, + void *dictBuffer, size_t dictBufferCapacity, ZDICT_random_params_t parameters) { BYTE *const dict = (BYTE *)dictBuffer; size_t tail = dictBufferCapacity; @@ -327,13 +96,13 @@ static size_t RANDOM_buildDictionary(const RANDOM_ctx_t *ctx, void *dictBuffer, while (tail > 0) { /* Select a segment */ - RANDOM_segment_t segment = RANDOM_selectSegment(ctx, parameters); + RANDOM_segment_t segment = RANDOM_selectSegment(totalSamplesSize, parameters); size_t segmentSize; segmentSize = MIN(segment.end - segment.begin + 1, tail); tail -= segmentSize; - memcpy(dict + tail, ctx->samples + segment.begin, segmentSize); + memcpy(dict + tail, samples + segment.begin, segmentSize); DISPLAYUPDATE( 2, "\r%u%% ", (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity)); @@ -342,6 +111,7 @@ static size_t RANDOM_buildDictionary(const RANDOM_ctx_t *ctx, void *dictBuffer, return tail; } + /*! ZDICT_trainFromBuffer_random(): * Train a dictionary from an array of samples using the RANDOM algorithm. * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, @@ -356,7 +126,6 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( ZDICT_random_params_t parameters) { const int displayLevel = parameters.zParams.notificationLevel; BYTE* const dict = (BYTE*)dictBuffer; - RANDOM_ctx_t ctx; /* Checks */ if (!RANDOM_checkParameters(parameters, dictBufferCapacity)) { DISPLAYLEVEL(1, "k is incorrect\n"); @@ -371,13 +140,12 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( ZDICT_DICTSIZE_MIN); return ERROR(dstSize_tooSmall); } + const size_t totalSamplesSize = RANDOM_sum(samplesSizes, nbSamples); + const BYTE *const samples = (const BYTE *)samplesBuffer; - if (!RANDOM_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples)) { - return ERROR(GENERIC); - } DISPLAYLEVEL(2, "Building dictionary\n"); { - const size_t tail = RANDOM_buildDictionary(&ctx, dictBuffer, dictBufferCapacity, parameters); + const size_t tail = RANDOM_buildDictionary(totalSamplesSize, samples, dictBuffer, dictBufferCapacity, parameters); const size_t dictSize = ZDICT_finalizeDictionary( dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, samplesBuffer, samplesSizes, nbSamples, parameters.zParams); @@ -385,71 +153,6 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( DISPLAYLEVEL(2, "Constructed dictionary of size %u\n", (U32)dictSize); } - RANDOM_ctx_destroy(&ctx); return dictSize; } } - - -int RANDOM_trainFromFiles(const char* dictFileName, unsigned maxDictSize, - const char** fileNamesTable, unsigned nbFiles, - size_t chunkSize, ZDICT_random_params_t *params){ - unsigned const displayLevel = params->zParams.notificationLevel; - void* const dictBuffer = malloc(maxDictSize); - fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel); - size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t)); - size_t const memMult = RANDOM_MEMMULT; - size_t const maxMem = findMaxMem(fs.totalSizeToLoad * memMult) / memMult; - size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad); - void* const srcBuffer = malloc(loadedSize+NOISELENGTH); - int result = 0; - - /* Checks */ - if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer)) - EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */ - if (fs.oneSampleTooLarge) { - DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n"); - DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n"); - DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX); - } - if (fs.nbSamples < 5) { - DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n"); - DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n"); - DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n"); - EXM_THROW(14, "nb of samples too low"); /* we now clearly forbid this case */ - } - if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) { - DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n"); - DISPLAYLEVEL(2, "! Samples should be about 100x larger than target dictionary size \n"); - } - - /* init */ - if (loadedSize < fs.totalSizeToLoad) - DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20)); - - /* Load input buffer */ - DISPLAYLEVEL(3, "Shuffling input files\n"); - shuffle(fileNamesTable, nbFiles); - nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel); - - { size_t dictSize; - dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, srcBuffer, - sampleSizes, fs.nbSamples, *params); - DISPLAYLEVEL(2, "k=%u\n", params->k); - if (ZDICT_isError(dictSize)) { - DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ - result = 1; - goto _cleanup; - } - /* save dict */ - DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); - saveDict(dictFileName, dictBuffer, dictSize); - } - - /* clean up */ -_cleanup: - free(srcBuffer); - free(sampleSizes); - free(dictBuffer); - return result; -} diff --git a/contrib/randomDictBuilder/random.h b/contrib/randomDictBuilder/random.h index 05879641..77529daf 100644 --- a/contrib/randomDictBuilder/random.h +++ b/contrib/randomDictBuilder/random.h @@ -8,18 +8,6 @@ #endif #include "zdict.h" - -/************************************** -* Context -***************************************/ -typedef struct { - const BYTE *samples; - size_t *offsets; - const size_t *samplesSizes; - size_t nbSamples; - U32 totalSamplesSize; -} RANDOM_ctx_t; - /** * A segment is an inclusive range in the source. */ @@ -35,19 +23,8 @@ typedef struct { } ZDICT_random_params_t; -typedef struct { - U64 totalSizeToLoad; - unsigned oneSampleTooLarge; - unsigned nbSamples; -} fileStats; - ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, ZDICT_random_params_t parameters); - - -int RANDOM_trainFromFiles(const char* dictFileName, unsigned maxDictSize, - const char** fileNamesTable, unsigned nbFiles, - size_t chunkSize, ZDICT_random_params_t *params);