From 49acfaeaec44a25c4628a2512965445152e8776a Mon Sep 17 00:00:00 2001
From: Jennifer Liu
Date: Tue, 17 Jul 2018 12:35:09 -0700
Subject: [PATCH] Move file loading functions to new file for access by benchmarking tool

---
 contrib/randomDictBuilder/Makefile |  11 +-
 contrib/randomDictBuilder/io.c     | 243 +++++++++++++++++++++++++++++
 contrib/randomDictBuilder/io.h     |  33 ++++
 contrib/randomDictBuilder/main.c   | 215 +------------------------
 4 files changed, 290 insertions(+), 212 deletions(-)
 create mode 100644 contrib/randomDictBuilder/io.c
 create mode 100644 contrib/randomDictBuilder/io.h

diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile
index 77dd2933..8360a409 100644
--- a/contrib/randomDictBuilder/Makefile
+++ b/contrib/randomDictBuilder/Makefile
@@ -13,15 +13,18 @@ run:
 	echo "Building a random dictionary with given arguments"
 	./main $(ARG)
 
-main: main.o random.o libzstd.a
-	gcc main.o random.o libzstd.a -o main
+main: main.o io.o random.o libzstd.a
+	gcc main.o io.o random.o libzstd.a -o main
 
-main.o: main.c $(PROGRAM_FILES)
-	gcc -c main.c $(PROGRAM_FILES) -I random.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
+main.o: main.c
+	gcc -c main.c -I io.h -I random.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
 
 random.o: random.c
 	gcc -c random.c -I random.h -I ../../lib/common -I ../../lib/dictBuilder
 
+io.o: io.c $(PROGRAM_FILES)
+	gcc -c io.c $(PROGRAM_FILES) -I io.h -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
+
 libzstd.a:
 	$(MAKE) -C ../../lib libzstd.a
 	mv ../../lib/libzstd.a .
diff --git a/contrib/randomDictBuilder/io.c b/contrib/randomDictBuilder/io.c
new file mode 100644
index 00000000..a5f71498
--- /dev/null
+++ b/contrib/randomDictBuilder/io.c
@@ -0,0 +1,243 @@
+#include <stdio.h>   /* fprintf */
+#include <stdlib.h>  /* malloc, free, qsort */
+#include <string.h>  /* strcmp, strlen */
+#include <errno.h>   /* errno */
+#include <ctype.h>
+#include "io.h"
+#include "fileio.h"   /* stdinmark, stdoutmark, ZSTD_EXTENSION */
+#include "platform.h" /* Large Files support */
+#include "util.h"
+#include "zdict.h"
+
+/*-*************************************
+* Console display
+***************************************/
+#define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
+
+static const U64 g_refreshRate = SEC_TO_MICRO / 6;
+static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
+
+#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
+            if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
+            { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
+            if (displayLevel>=4) fflush(stderr); } } }
+
+/*-*************************************
+* Exceptions
+***************************************/
+#ifndef DEBUG
+#  define DEBUG 0
+#endif
+#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
+#define EXM_THROW(error, ...) \
+{ \
+    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
+    DISPLAY("Error %i : ", error); \
+    DISPLAY(__VA_ARGS__); \
+    DISPLAY("\n"); \
+    exit(error); \
+}
+
+
+/*-*************************************
+* Constants
+***************************************/
+
+#define SAMPLESIZE_MAX (128 KB)
+#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
+#define RANDOM_MEMMULT 9
+static const size_t g_maxMemory = (sizeof(size_t) == 4) ?
+                   (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
+
+#define NOISELENGTH 32
+
+
+
+/* ********************************************************
+* File related operations
+**********************************************************/
+/** loadFiles() :
+ * load samples from files listed in fileNamesTable into buffer.
+ * works even if buffer is too small to load all samples.
+ * Also provides the size of each sample into sampleSizes table
+ * which must be sized correctly, using DiB_fileStats().
+ * @return : nb of samples effectively loaded into `buffer`
+ * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
+ * sampleSizes is filled with the size of each sample.
+ */
+static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, size_t* sampleSizes,
+                          unsigned sstSize, const char** fileNamesTable, unsigned nbFiles,
+                          size_t targetChunkSize, unsigned displayLevel) {
+    char* const buff = (char*)buffer;
+    size_t pos = 0;
+    unsigned nbLoadedChunks = 0, fileIndex;
+
+    for (fileIndex=0; fileIndex<nbFiles; fileIndex++) {
+        const char* const fileName = fileNamesTable[fileIndex];
+        unsigned long long const fs64 = UTIL_getFileSize(fileName);
+        unsigned long long remainingToLoad = fs64;
+        U32 const nbChunks = targetChunkSize ? (U32)((fs64 + (targetChunkSize-1)) / targetChunkSize) : 1;
+        U64 const chunkSize = targetChunkSize ? MIN(targetChunkSize, fs64) : fs64;
+        size_t const maxChunkSize = MIN((size_t)chunkSize, SAMPLESIZE_MAX);
+        U32 cnb;
+        FILE* const f = fopen(fileName, "rb");
+        if (f==NULL) EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileName, strerror(errno));
+        DISPLAYUPDATE(2, "Loading %s...       \r", fileName);
+        for (cnb=0; cnb<nbChunks; cnb++) {
+            size_t const toLoad = (size_t)MIN(maxChunkSize, remainingToLoad);
+            if (toLoad > *bufferSizePtr-pos) break;
+            { size_t const readSize = fread(buff+pos, 1, toLoad, f);
+              if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName);
+              pos += readSize;
+              sampleSizes[nbLoadedChunks++] = toLoad;
+              remainingToLoad -= targetChunkSize;
+              if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */
+                  fileIndex = nbFiles;  /* stop there */
+                  break;
+              }
+              if (toLoad < targetChunkSize) {
+                  fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR);
+        } } }
+        fclose(f);
+    }
+    DISPLAYLEVEL(2, "\r%79s\r", "");
+    *bufferSizePtr = pos;
+    DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10))
+    return nbLoadedChunks;
+}
+
+#define rotl32(x,r) ((x << r) | (x >> (32 - r)))
+static U32 getRand(U32* src)
+{
+    static const U32 prime1 = 2654435761U;
+    static const U32 prime2 = 2246822519U;
+    U32 rand32 = *src;
+    rand32 *= prime1;
+    rand32 ^= prime2;
+    rand32  = rotl32(rand32, 13);
+    *src = rand32;
+    return rand32 >> 5;
+}
+
+/* shuffle() :
+ * shuffle a table of file names in a semi-random way
+ * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
+ * it will load random elements from it, instead of just the first ones. */
+static void shuffle(const char** fileNamesTable, unsigned nbFiles) {
+    U32 seed = 0xFD2FB528;
+    unsigned i;
+    for (i = nbFiles - 1; i > 0; --i) {
+        unsigned const j = getRand(&seed) % (i + 1);
+        const char* const tmp = fileNamesTable[j];
+        fileNamesTable[j] = fileNamesTable[i];
+        fileNamesTable[i] = tmp;
+    }
+}
+
+
+/*-********************************************************
+* Dictionary training functions
+**********************************************************/
+static size_t findMaxMem(unsigned long long requiredMem) {
+    size_t const step = 8 MB;
+    void* testmem = NULL;
+
+    requiredMem = (((requiredMem >> 23) + 1) << 23);
+    requiredMem += step;
+    if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
+
+    while (!testmem) {
+        testmem = malloc((size_t)requiredMem);
+        requiredMem -= step;
+    }
+
+    free(testmem);
+    return (size_t)requiredMem;
+}
+
+void saveDict(const char* dictFileName,
+              const void* buff, size_t buffSize) {
+    FILE* const f = fopen(dictFileName, "wb");
+    if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
+
+    { size_t const n = fwrite(buff, 1, buffSize, f);
+      if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) }
+
+    { size_t const n = (size_t)fclose(f);
+      if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
+}
+
+/*! getFileStats() :
+ * Given a list of files, and a chunkSize (0 == no chunk, whole files)
+ * provides the amount of data to be loaded and the resulting nb of samples.
+ * This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
+ */
+static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles,
+                              size_t chunkSize, unsigned displayLevel) {
+    fileStats fs;
+    unsigned n;
+    memset(&fs, 0, sizeof(fs));
+    for (n=0; n<nbFiles; n++) {
+        U64 const fileSize = UTIL_getFileSize(fileNamesTable[n]);
+        U32 const nbSamples = (U32)(chunkSize ? (fileSize + (chunkSize-1)) / chunkSize : 1);
+        U64 const chunkToLoad = chunkSize ? MIN(chunkSize, fileSize) : fileSize;
+        size_t const cappedChunkSize = MIN(chunkToLoad, SAMPLESIZE_MAX);
+        fs.totalSizeToLoad += cappedChunkSize * nbSamples;
+        fs.oneSampleTooLarge |= (chunkSize > 2*SAMPLESIZE_MAX);
+        fs.nbSamples += nbSamples;
+    }
+    DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10));
+    return fs;
+}
+
+
+
+
+sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
+                          unsigned maxDictSize, const unsigned displayLevel) {
+    fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
+    size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
+    size_t const memMult = RANDOM_MEMMULT;
+    size_t const maxMem = findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
+    size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
+    void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
+
+    /* Checks */
+    if ((!sampleSizes) || (!srcBuffer))
+        EXM_THROW(12, "not enough memory for trainFromFiles");   /* should not happen */
+    if (fs.oneSampleTooLarge) {
+        DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n");
+        DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n");
+        DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
+    }
+    if (fs.nbSamples < 5) {
+        DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n");
+        DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n");
+        DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
+        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
+    }
+    if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) {
+        DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n");
+        DISPLAYLEVEL(2, "! Samples should be about 100x larger than target dictionary size \n");
+    }
+
+    /* init */
+    if (loadedSize < fs.totalSizeToLoad)
+        DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20));
+
+    /* Load input buffer */
+    DISPLAYLEVEL(3, "Shuffling input files\n");
+    shuffle(fileNamesTable, nbFiles);
+    nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples,
+                        fileNamesTable, nbFiles, chunkSize, displayLevel);
+
+    sampleInfo *info = (sampleInfo *)malloc(sizeof(sampleInfo));
+
+    info->nbSamples = fs.nbSamples;
+    info->samplesSizes = sampleSizes;
+    info->srcBuffer = srcBuffer;
+
+    return info;
+}
diff --git a/contrib/randomDictBuilder/io.h b/contrib/randomDictBuilder/io.h
new file mode 100644
index 00000000..4b5639fe
--- /dev/null
+++ b/contrib/randomDictBuilder/io.h
@@ -0,0 +1,33 @@
+#include <stdio.h>   /* fprintf */
+#include <stdlib.h>  /* malloc, free, qsort */
+#include <string.h>  /* strcmp, strlen */
+#include <errno.h>   /* errno */
+#include <ctype.h>
+#include "zstd_internal.h" /* includes zstd.h */
+#include "fileio.h"   /* stdinmark, stdoutmark, ZSTD_EXTENSION */
+#include "platform.h" /* Large Files support */
+#include "util.h"
+#include "zdict.h"
+
+
+/*-*************************************
+* Structs
+***************************************/
+typedef struct {
+    U64 totalSizeToLoad;
+    unsigned oneSampleTooLarge;
+    unsigned nbSamples;
+} fileStats;
+
+typedef struct {
+    const void* srcBuffer;
+    const size_t *samplesSizes;
+    size_t nbSamples;
+}sampleInfo;
+
+
+sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
+                          unsigned maxDictSize, const unsigned displayLevel);
+
+
+void saveDict(const char* dictFileName, const void* buff, size_t buffSize);
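Aside (not part of the patch): the io.h interface above is the surface the benchmarking tool from the subject line is meant to reuse. The sketch below shows one way a caller might drive it; only getSampleInfo(), saveDict() and the sampleInfo struct come from io.h, while the file name, the 110 KB default and the placeholder training step are assumptions for illustration, and such a file would need the same include paths the Makefile passes when building main.o.

/* bench_sketch.c -- hypothetical consumer of io.h, illustrative only */
#include <stdlib.h>   /* free */
#include "io.h"       /* sampleInfo, getSampleInfo, saveDict */

int main(int argCount, const char* argv[])
{
    unsigned const displayLevel = 2;
    unsigned const maxDictSize  = 110 * (1 << 10);   /* same 110 KB default as main.c */

    if (argCount < 2) return 1;   /* expects sample files as arguments */

    /* each remaining argument is one sample file; chunkSize==0 loads whole files */
    {   sampleInfo* const info = getSampleInfo(argv + 1, (unsigned)(argCount - 1),
                                               0, maxDictSize, displayLevel);

        /* a real tool would train and/or benchmark here, e.g. pass `info` to
         * RANDOM_trainFromFiles(), or hand info->srcBuffer, info->samplesSizes
         * and info->nbSamples to a ZDICT_* trainer, then persist the result with
         * saveDict(dictFileName, dictBuffer, dictSize). */

        free((void*)info->samplesSizes);
        free((void*)info->srcBuffer);
        free(info);
    }
    return 0;
}
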
diff --git a/contrib/randomDictBuilder/main.c b/contrib/randomDictBuilder/main.c
index e66f2847..34a9d99e 100644
--- a/contrib/randomDictBuilder/main.c
+++ b/contrib/randomDictBuilder/main.c
@@ -4,11 +4,11 @@
 #include <errno.h>   /* errno */
 #include <ctype.h>
 #include "random.h"
-#include "fileio.h"   /* stdinmark, stdoutmark, ZSTD_EXTENSION */
-#include "platform.h" /* Large Files support */
+#include "io.h"
 #include "util.h"
 #include "zdict.h"
 
+
 /*-*************************************
 * Console display
 ***************************************/
@@ -23,6 +23,7 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
             { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
             if (displayLevel>=4) fflush(stderr); } } }
 
+
 /*-*************************************
 * Exceptions
 ***************************************/
@@ -39,6 +40,7 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
     exit(error); \
 }
 
+
 /*-*************************************
 * Constants
 ***************************************/
@@ -49,29 +51,6 @@ static const unsigned g_defaultMaxDictSize = 110 KB;
 #define DEFAULT_OUTPUTFILE "defaultDict"
 #define DEFAULT_DICTID 0
 
-#define SAMPLESIZE_MAX (128 KB)
-#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB))
-#define RANDOM_MEMMULT 9
-static const size_t g_maxMemory = (sizeof(size_t) == 4) ?
-                   (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
-
-#define NOISELENGTH 32
-
-
-/*-*************************************
-* Structs
-***************************************/
-typedef struct {
-    U64 totalSizeToLoad;
-    unsigned oneSampleTooLarge;
-    unsigned nbSamples;
-} fileStats;
-
-typedef struct {
-    const void* srcBuffer;
-    const size_t *samplesSizes;
-    size_t nbSamples;
-}sampleInfo;
 
 
 /*-*************************************
@@ -112,144 +91,11 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
     return result;
 }
 
-/* ********************************************************
-* File related operations
-**********************************************************/
-/** loadFiles() :
- * load samples from files listed in fileNamesTable into buffer.
- * works even if buffer is too small to load all samples.
- * Also provides the size of each sample into sampleSizes table
- * which must be sized correctly, using DiB_fileStats().
- * @return : nb of samples effectively loaded into `buffer`
- * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
- * sampleSizes is filled with the size of each sample.
- */
-static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, size_t* sampleSizes,
-                          unsigned sstSize, const char** fileNamesTable, unsigned nbFiles,
-                          size_t targetChunkSize, unsigned displayLevel) {
-    char* const buff = (char*)buffer;
-    size_t pos = 0;
-    unsigned nbLoadedChunks = 0, fileIndex;
-
-    for (fileIndex=0; fileIndex<nbFiles; fileIndex++) {
-        const char* const fileName = fileNamesTable[fileIndex];
-        unsigned long long const fs64 = UTIL_getFileSize(fileName);
-        unsigned long long remainingToLoad = fs64;
-        U32 const nbChunks = targetChunkSize ? (U32)((fs64 + (targetChunkSize-1)) / targetChunkSize) : 1;
-        U64 const chunkSize = targetChunkSize ? MIN(targetChunkSize, fs64) : fs64;
-        size_t const maxChunkSize = MIN((size_t)chunkSize, SAMPLESIZE_MAX);
-        U32 cnb;
-        FILE* const f = fopen(fileName, "rb");
-        if (f==NULL) EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileName, strerror(errno));
-        DISPLAYUPDATE(2, "Loading %s...       \r", fileName);
-        for (cnb=0; cnb<nbChunks; cnb++) {
-            size_t const toLoad = (size_t)MIN(maxChunkSize, remainingToLoad);
-            if (toLoad > *bufferSizePtr-pos) break;
-            { size_t const readSize = fread(buff+pos, 1, toLoad, f);
-              if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName);
-              pos += readSize;
-              sampleSizes[nbLoadedChunks++] = toLoad;
-              remainingToLoad -= targetChunkSize;
-              if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */
-                  fileIndex = nbFiles;  /* stop there */
-                  break;
-              }
-              if (toLoad < targetChunkSize) {
-                  fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR);
-        } } }
-        fclose(f);
-    }
-    DISPLAYLEVEL(2, "\r%79s\r", "");
-    *bufferSizePtr = pos;
-    DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10))
-    return nbLoadedChunks;
-}
-
-#define rotl32(x,r) ((x << r) | (x >> (32 - r)))
-static U32 getRand(U32* src)
-{
-    static const U32 prime1 = 2654435761U;
-    static const U32 prime2 = 2246822519U;
-    U32 rand32 = *src;
-    rand32 *= prime1;
-    rand32 ^= prime2;
-    rand32  = rotl32(rand32, 13);
-    *src = rand32;
-    return rand32 >> 5;
-}
-
-/* shuffle() :
- * shuffle a table of file names in a semi-random way
- * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
- * it will load random elements from it, instead of just the first ones. */
-static void shuffle(const char** fileNamesTable, unsigned nbFiles) {
-    U32 seed = 0xFD2FB528;
-    unsigned i;
-    for (i = nbFiles - 1; i > 0; --i) {
-        unsigned const j = getRand(&seed) % (i + 1);
-        const char* const tmp = fileNamesTable[j];
-        fileNamesTable[j] = fileNamesTable[i];
-        fileNamesTable[i] = tmp;
-    }
-}
-
-
-/*-********************************************************
-* Dictionary training functions
-**********************************************************/
-static size_t findMaxMem(unsigned long long requiredMem) {
-    size_t const step = 8 MB;
-    void* testmem = NULL;
-
-    requiredMem = (((requiredMem >> 23) + 1) << 23);
-    requiredMem += step;
-    if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
-
-    while (!testmem) {
-        testmem = malloc((size_t)requiredMem);
-        requiredMem -= step;
-    }
-
-    free(testmem);
-    return (size_t)requiredMem;
-}
-
-static void saveDict(const char* dictFileName,
-                     const void* buff, size_t buffSize) {
-    FILE* const f = fopen(dictFileName, "wb");
-    if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
-
-    { size_t const n = fwrite(buff, 1, buffSize, f);
-      if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) }
-
-    { size_t const n = (size_t)fclose(f);
-      if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
-}
-
-/*! getFileStats() :
- * Given a list of files, and a chunkSize (0 == no chunk, whole files)
- * provides the amount of data to be loaded and the resulting nb of samples.
- * This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
- */
-static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles,
-                              size_t chunkSize, unsigned displayLevel) {
-    fileStats fs;
-    unsigned n;
-    memset(&fs, 0, sizeof(fs));
-    for (n=0; n<nbFiles; n++) {
-        U64 const fileSize = UTIL_getFileSize(fileNamesTable[n]);
-        U32 const nbSamples = (U32)(chunkSize ? (fileSize + (chunkSize-1)) / chunkSize : 1);
-        U64 const chunkToLoad = chunkSize ? MIN(chunkSize, fileSize) : fileSize;
-        size_t const cappedChunkSize = MIN(chunkToLoad, SAMPLESIZE_MAX);
-        fs.totalSizeToLoad += cappedChunkSize * nbSamples;
-        fs.oneSampleTooLarge |= (chunkSize > 2*SAMPLESIZE_MAX);
-        fs.nbSamples += nbSamples;
-    }
-    DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10));
-    return fs;
-}
-
+/*-*************************************
+* RANDOM
+***************************************/
 int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info,
                           unsigned maxDictSize,
                           ZDICT_random_params_t *params) {
@@ -281,53 +127,6 @@ int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info,
     return result;
 }
 
-sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
-                          unsigned maxDictSize, const unsigned displayLevel) {
-    fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
-    size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
-    size_t const memMult = RANDOM_MEMMULT;
-    size_t const maxMem = findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
-    size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
-    void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
-
-    /* Checks */
-    if ((!sampleSizes) || (!srcBuffer))
-        EXM_THROW(12, "not enough memory for trainFromFiles");   /* should not happen */
-    if (fs.oneSampleTooLarge) {
-        DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n");
-        DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n");
-        DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
-    }
-    if (fs.nbSamples < 5) {
-        DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n");
-        DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n");
-        DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
-        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
-    }
-    if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) {
-        DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n");
-        DISPLAYLEVEL(2, "! Samples should be about 100x larger than target dictionary size \n");
-    }
-
-    /* init */
-    if (loadedSize < fs.totalSizeToLoad)
-        DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20));
-
-    /* Load input buffer */
-    DISPLAYLEVEL(3, "Shuffling input files\n");
-    shuffle(fileNamesTable, nbFiles);
-    nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples,
-                        fileNamesTable, nbFiles, chunkSize, displayLevel);
-
-    sampleInfo *info = (sampleInfo *)malloc(sizeof(sampleInfo));
-
-    info->nbSamples = fs.nbSamples;
-    info->samplesSizes = sampleSizes;
-    info->srcBuffer = srcBuffer;
-
-    return info;
-}
-
 
 int main(int argCount, const char* argv[])