diff --git a/contrib/randomDictBuilder/Makefile b/contrib/randomDictBuilder/Makefile new file mode 100644 index 00000000..a2aade23 --- /dev/null +++ b/contrib/randomDictBuilder/Makefile @@ -0,0 +1,48 @@ +PROGRAM_FILES := ../../programs/fileio.c + +TEST_INPUT := ../../lib +TEST_OUTPUT := randomDict +ARG := + +all: main testrun test clean + +run: main rand clean + +.PHONY: rand +rand: + echo "Building a random dictionary with given arguments" + ./main $(ARG) + + +main: random.o main.o libzstd.a + gcc random.o main.o libzstd.a -o main + +main.o: main.c + gcc -c main.c -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder -I random.h + +random.o: $(PROGRAM_FILES) random.c + gcc -c $(PROGRAM_FILES) -I ../../programs -I ../../lib/common -I random.h random.c + +libzstd.a: + $(MAKE) -C ../../lib libzstd.a + mv ../../lib/libzstd.a . + +.PHONY: testrun +testrun: main + echo "Run with $(TEST_INPUT) and $(TEST_OUTPUT) " + ./main in=$(TEST_INPUT) out=$(TEST_OUTPUT) + zstd -be3 -D $(TEST_OUTPUT) -r $(TEST_INPUT) -q + rm -f $(TEST_OUTPUT) + +.PHONY: test +test: test.sh + sh test.sh + echo "Finish running test.sh" + +.PHONY: clean +clean: + rm -f libzstd.a main + rm -f ../../lib/*/*.o + rm -f ../../programs/*.o + rm -f *.o + echo "Cleaning is completed" diff --git a/contrib/randomDictBuilder/README.md b/contrib/randomDictBuilder/README.md new file mode 100644 index 00000000..cadffdf2 --- /dev/null +++ b/contrib/randomDictBuilder/README.md @@ -0,0 +1,13 @@ +Random Dictionary Builder + +### Permitted Arguments: +Input Files (in=fileName): files used to build dictionary, can include multiple files, each following "in=", required +Output Dictionary (out=dictName): if not provided, default to defaultDict +Dictionary ID (dictID=#): positive number, if not provided, default to 0 +Maximum Dictionary Size (maxdict=#): positive number, in bytes, if not provided, default to 110KB +Size of Randomly Selected Segment (k=#): positive number, in bytes, if not provided, 
/** readU32FromChar() :
 *  Parse an unsigned decimal number located at *stringPtr.
 *  An optional size suffix is accepted right after the digits :
 *  `K`, `KB`, `KiB` multiply the value by 1024,
 *  `M`, `MB`, `MiB` multiply it by 1024*1024.
 *  *stringPtr is advanced past every character consumed
 *  (it is left unchanged when the text does not start with a digit or suffix).
 *  If the value cannot be represented, the error message is printed on stderr
 *  and the program exits with status 1.
 * @return : the parsed value (0 when no digit is present).
 */
static unsigned readU32FromChar(const char** stringPtr)
{
    static const char errorMsg[] = "error: numeric value too large";
    unsigned result = 0;

    while ((**stringPtr >= '0') && (**stringPtr <= '9')) {
        /* conservative bound : result*10 + 9 must still fit in an unsigned */
        unsigned const max = (((unsigned)(-1)) / 10) - 1;
        if (result > max) {
            fprintf(stderr, "%s \n", errorMsg);
            exit(1);
        }
        result *= 10;
        result += (unsigned)(**stringPtr - '0');
        (*stringPtr)++;
    }

    if ((**stringPtr == 'K') || (**stringPtr == 'M')) {
        /* largest value that can still be shifted left by 10 bits */
        unsigned const maxK = ((unsigned)(-1)) >> 10;
        if (result > maxK) {
            fprintf(stderr, "%s \n", errorMsg);
            exit(1);
        }
        result <<= 10;
        if (**stringPtr == 'M') {
            if (result > maxK) {
                fprintf(stderr, "%s \n", errorMsg);
                exit(1);
            }
            result <<= 10;
        }
        (*stringPtr)++;   /* skip `K` or `M` */
        if (**stringPtr == 'i') (*stringPtr)++;
        if (**stringPtr == 'B') (*stringPtr)++;
    }
    return result;
}
+ */ +static unsigned longCommandWArg(const char** stringPtr, const char* longCommand) +{ + size_t const comSize = strlen(longCommand); + int const result = !strncmp(*stringPtr, longCommand, comSize); + if (result) *stringPtr += comSize; + return result; +} + + +int main(int argCount, const char* argv[]) +{ + int displayLevel = 2; + const char* programName = argv[0]; + int operationResult = 0; + + unsigned cLevel = DEFAULT_CLEVEL; + char* inputFile = DEFAULT_INPUTFILE; + unsigned k = DEFAULT_k; + char* outputFile = DEFAULT_OUTPUTFILE; + unsigned dictID = DEFAULT_DICTID; + unsigned maxDictSize = g_defaultMaxDictSize; + + const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); + unsigned filenameIdx = 0; + + for (int i = 1; i < argCount; i++) { + const char* argument = argv[i]; + if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "c=")) { cLevel = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "in=")) { + inputFile = malloc(strlen(argument) + 1); + strcpy(inputFile, argument); + filenameTable[filenameIdx] = inputFile; + filenameIdx++; + continue; + } + if (longCommandWArg(&argument, "out=")) { + outputFile = malloc(strlen(argument) + 1); + strcpy(outputFile, argument); + continue; + } + DISPLAYLEVEL(1, "Incorrect parameters\n"); + operationResult = 1; + return operationResult; + } + + + char* fileNamesBuf = NULL; + unsigned fileNamesNb = filenameIdx; + int followLinks = 0; + const char** extendedFileList = NULL; + extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, &fileNamesNb, followLinks); + if (extendedFileList) { + unsigned u; + for (u=0; u /* fprintf */ +#include /* malloc, free, qsort */ +#include /* memset */ +#include 
/* clock */ +#include "zstd_internal.h" /* includes zstd.h */ +#ifndef ZDICT_STATIC_LINKING_ONLY +#define ZDICT_STATIC_LINKING_ONLY +#endif +#include "random.h" +#include "platform.h" /* Large Files support */ +#include "util.h" /* UTIL_getFileSize, UTIL_getTotalFileSize */ + +/*-************************************* +* Constants +***************************************/ +#define SAMPLESIZE_MAX (128 KB) +#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) +#define RANDOM_MEMMULT 9 +static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); + +#define NOISELENGTH 32 +#define DEFAULT_K 200 + +/*-************************************* +* Console display +***************************************/ +#define DISPLAY(...) fprintf(stderr, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } + +static const U64 g_refreshRate = SEC_TO_MICRO / 6; +static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; + +#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \ + if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \ + { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ + if (displayLevel>=4) fflush(stderr); } } } + + +/*-************************************* +* Exceptions +***************************************/ +#ifndef DEBUG +# define DEBUG 0 +#endif +#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); +#define EXM_THROW(error, ...) \ +{ \ + DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ + DISPLAY("Error %i : ", error); \ + DISPLAY(__VA_ARGS__); \ + DISPLAY("\n"); \ + exit(error); \ +} + + +/* ******************************************************** +* File related operations +**********************************************************/ +/** loadFiles() : + * load samples from files listed in fileNamesTable into buffer. + * works even if buffer is too small to load all samples. 
+ * Also provides the size of each sample into sampleSizes table + * which must be sized correctly, using DiB_fileStats(). + * @return : nb of samples effectively loaded into `buffer` + * *bufferSizePtr is modified, it provides the amount data loaded within buffer. + * sampleSizes is filled with the size of each sample. + */ +static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, + size_t* sampleSizes, unsigned sstSize, + const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize, + unsigned displayLevel) +{ + char* const buff = (char*)buffer; + size_t pos = 0; + unsigned nbLoadedChunks = 0, fileIndex; + + for (fileIndex=0; fileIndex *bufferSizePtr-pos) break; + { size_t const readSize = fread(buff+pos, 1, toLoad, f); + if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName); + pos += readSize; + sampleSizes[nbLoadedChunks++] = toLoad; + remainingToLoad -= targetChunkSize; + if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */ + fileIndex = nbFiles; /* stop there */ + break; + } + if (toLoad < targetChunkSize) { + fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR); + } } } + fclose(f); + } + DISPLAYLEVEL(2, "\r%79s\r", ""); + *bufferSizePtr = pos; + DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10)) + return nbLoadedChunks; +} + + + +#define rotl32(x,r) ((x << r) | (x >> (32 - r))) +static U32 getRand(U32* src) +{ + static const U32 prime1 = 2654435761U; + static const U32 prime2 = 2246822519U; + U32 rand32 = *src; + rand32 *= prime1; + rand32 ^= prime2; + rand32 = rotl32(rand32, 13); + *src = rand32; + return rand32 >> 5; +} + + +/* shuffle() : + * shuffle a table of file names in a semi-random way + * It improves dictionary quality by reducing "locality" impact, so if sample set is very large, + * it will load random elements from it, instead of just the first ones. 
*/ +static void shuffle(const char** fileNamesTable, unsigned nbFiles) { + U32 seed = 0xFD2FB528; + unsigned i; + for (i = nbFiles - 1; i > 0; --i) { + unsigned const j = getRand(&seed) % (i + 1); + const char* const tmp = fileNamesTable[j]; + fileNamesTable[j] = fileNamesTable[i]; + fileNamesTable[i] = tmp; + } +} + + + +/*-******************************************************** +* Dictionary training functions +**********************************************************/ +static size_t findMaxMem(unsigned long long requiredMem) +{ + size_t const step = 8 MB; + void* testmem = NULL; + + requiredMem = (((requiredMem >> 23) + 1) << 23); + requiredMem += step; + if (requiredMem > g_maxMemory) requiredMem = g_maxMemory; + + while (!testmem) { + testmem = malloc((size_t)requiredMem); + requiredMem -= step; + } + + free(testmem); + return (size_t)requiredMem; +} + +static void saveDict(const char* dictFileName, + const void* buff, size_t buffSize) +{ + FILE* const f = fopen(dictFileName, "wb"); + if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName); + + { size_t const n = fwrite(buff, 1, buffSize, f); + if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) } + + { size_t const n = (size_t)fclose(f); + if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) } +} + +/*! getFileStats() : + * Given a list of files, and a chunkSize (0 == no chunk, whole files) + * provides the amount of data to be loaded and the resulting nb of samples. + * This is useful primarily for allocation purpose => sample buffer, and sample sizes table. 
+ */ +static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel) +{ + fileStats fs; + unsigned n; + memset(&fs, 0, sizeof(fs)); + for (n=0; n 2*SAMPLESIZE_MAX); + fs.nbSamples += nbSamples; + } + DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10)); + return fs; +} + + + + + +/* ******************************************************** +* Random Dictionary Builder +**********************************************************/ +/** + * Returns the sum of the sample sizes. + */ +static size_t RANDOM_sum(const size_t *samplesSizes, unsigned nbSamples) { + size_t sum = 0; + unsigned i; + for (i = 0; i < nbSamples; ++i) { + sum += samplesSizes[i]; + } + return sum; +} + + +/** + * Selects a random segment from totalSamplesSize - k + 1 possible segments + */ +static RANDOM_segment_t RANDOM_selectSegment(const RANDOM_ctx_t *ctx, + ZDICT_random_params_t parameters) { + const U32 k = parameters.k; + RANDOM_segment_t segment; + unsigned index; + + /* Seed random number generator */ + srand((unsigned)time(NULL)); + /* Randomly generate a number from 0 to sampleSizes - k */ + index = rand()%(ctx->totalSamplesSize - k + 1); + + /* inclusive */ + segment.begin = index; + segment.end = index + k - 1; + + return segment; +} + + +/** + * Check the validity of the parameters. + * Returns non-zero if the parameters are valid and 0 otherwise. + */ +static int RANDOM_checkParameters(ZDICT_random_params_t parameters, size_t maxDictSize) { + /* k is a required parameter */ + if (parameters.k == 0) { + return 0; + } + /* k <= maxDictSize */ + if (parameters.k > maxDictSize) { + return 0; + } + return 1; +} + + +/** + * Clean up a context initialized with `RANDOM_ctx_init()`. + */ +static void RANDOM_ctx_destroy(RANDOM_ctx_t *ctx) { + if (!ctx) { + return; + } + if (ctx->offsets) { + free(ctx->offsets); + ctx->offsets = NULL; + } +} + + +/** + * Prepare a context for dictionary building. 
+ * Returns 1 on success or zero on error. + * The context must be destroyed with `RANDOM_ctx_destroy()`. + */ +static int RANDOM_ctx_init(RANDOM_ctx_t *ctx, const void *samplesBuffer, + const size_t *samplesSizes, unsigned nbSamples) { + const BYTE *const samples = (const BYTE *)samplesBuffer; + const size_t totalSamplesSize = RANDOM_sum(samplesSizes, nbSamples); + const int displayLevel = 2; + /* Checks */ + if (totalSamplesSize >= (size_t)RANDOM_MAX_SAMPLES_SIZE) { + DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n", + (U32)(totalSamplesSize>>20), (RANDOM_MAX_SAMPLES_SIZE >> 20)); + return 0; + } + memset(ctx, 0, sizeof(*ctx)); + DISPLAYLEVEL(1, "Building dictionary from %u samples of total size %u\n", nbSamples, + (U32)totalSamplesSize); + ctx->samples = samples; + ctx->samplesSizes = samplesSizes; + ctx->nbSamples = nbSamples; + ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t)); + ctx->totalSamplesSize = (U32)totalSamplesSize; + if (!ctx->offsets) { + DISPLAYLEVEL(1, "Failed to allocate buffer for offsets\n"); + RANDOM_ctx_destroy(ctx); + return 0; + } + { + U32 i; + ctx->offsets[0] = 0; + for (i = 1; i <= nbSamples; ++i) { + ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1]; + } + } + return 1; +} + + +/** + * Given the prepared context build the dictionary. 
+ */ +static size_t RANDOM_buildDictionary(const RANDOM_ctx_t *ctx, void *dictBuffer, + size_t dictBufferCapacity, + ZDICT_random_params_t parameters) { + BYTE *const dict = (BYTE *)dictBuffer; + size_t tail = dictBufferCapacity; + const int displayLevel = parameters.zParams.notificationLevel; + while (tail > 0) { + + /* Select a segment */ + RANDOM_segment_t segment = RANDOM_selectSegment(ctx, parameters); + + size_t segmentSize; + segmentSize = MIN(segment.end - segment.begin + 1, tail); + + tail -= segmentSize; + memcpy(dict + tail, ctx->samples + segment.begin, segmentSize); + DISPLAYUPDATE( + 2, "\r%u%% ", + (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity)); + } + + return tail; +} + +/*! ZDICT_trainFromBuffer_random(): + * Train a dictionary from an array of samples using the RANDOM algorithm. + * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, + * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. + * The resulting dictionary will be saved into `dictBuffer`. + * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) + * or an error code, which can be tested with ZDICT_isError(). 
+ */ +ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( + void *dictBuffer, size_t dictBufferCapacity, + const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, + ZDICT_random_params_t parameters) { + const int displayLevel = parameters.zParams.notificationLevel; + BYTE* const dict = (BYTE*)dictBuffer; + RANDOM_ctx_t ctx; + /* Checks */ + if (!RANDOM_checkParameters(parameters, dictBufferCapacity)) { + DISPLAYLEVEL(1, "k is incorrect\n"); + return ERROR(GENERIC); + } + if (nbSamples == 0) { + DISPLAYLEVEL(1, "Random must have at least one input file\n"); + return ERROR(GENERIC); + } + if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { + DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n", + ZDICT_DICTSIZE_MIN); + return ERROR(dstSize_tooSmall); + } + + if (!RANDOM_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples)) { + return ERROR(GENERIC); + } + DISPLAYLEVEL(2, "Building dictionary\n"); + { + const size_t tail = RANDOM_buildDictionary(&ctx, dictBuffer, dictBufferCapacity, parameters); + const size_t dictSize = ZDICT_finalizeDictionary( + dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, + samplesBuffer, samplesSizes, nbSamples, parameters.zParams); + if (!ZSTD_isError(dictSize)) { + DISPLAYLEVEL(2, "Constructed dictionary of size %u\n", + (U32)dictSize); + } + RANDOM_ctx_destroy(&ctx); + return dictSize; + } +} + + +int RANDOM_trainFromFiles(const char* dictFileName, unsigned maxDictSize, + const char** fileNamesTable, unsigned nbFiles, + size_t chunkSize, ZDICT_random_params_t *params){ + unsigned const displayLevel = params->zParams.notificationLevel; + void* const dictBuffer = malloc(maxDictSize); + fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel); + size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t)); + size_t const memMult = RANDOM_MEMMULT; + size_t const maxMem = findMaxMem(fs.totalSizeToLoad * memMult) / memMult; + size_t loadedSize = (size_t) MIN ((unsigned 
long long)maxMem, fs.totalSizeToLoad); + void* const srcBuffer = malloc(loadedSize+NOISELENGTH); + int result = 0; + + /* Checks */ + if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer)) + EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */ + if (fs.oneSampleTooLarge) { + DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n"); + DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n"); + DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX); + } + if (fs.nbSamples < 5) { + DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n"); + DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n"); + DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n"); + EXM_THROW(14, "nb of samples too low"); /* we now clearly forbid this case */ + } + if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) { + DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n"); + DISPLAYLEVEL(2, "! 
Samples should be about 100x larger than target dictionary size \n"); + } + + /* init */ + if (loadedSize < fs.totalSizeToLoad) + DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20)); + + /* Load input buffer */ + DISPLAYLEVEL(3, "Shuffling input files\n"); + shuffle(fileNamesTable, nbFiles); + nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel); + + { size_t dictSize; + dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, srcBuffer, + sampleSizes, fs.nbSamples, *params); + DISPLAYLEVEL(2, "k=%u\n", params->k); + if (ZDICT_isError(dictSize)) { + DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ + result = 1; + goto _cleanup; + } + /* save dict */ + DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); + saveDict(dictFileName, dictBuffer, dictSize); + } + + /* clean up */ +_cleanup: + free(srcBuffer); + free(sampleSizes); + free(dictBuffer); + return result; +} diff --git a/contrib/randomDictBuilder/random.h b/contrib/randomDictBuilder/random.h new file mode 100644 index 00000000..05879641 --- /dev/null +++ b/contrib/randomDictBuilder/random.h @@ -0,0 +1,53 @@ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* memset */ +#include /* clock */ +#include "zstd_internal.h" /* includes zstd.h */ +#ifndef ZDICT_STATIC_LINKING_ONLY +#define ZDICT_STATIC_LINKING_ONLY +#endif +#include "zdict.h" + + +/************************************** +* Context +***************************************/ +typedef struct { + const BYTE *samples; + size_t *offsets; + const size_t *samplesSizes; + size_t nbSamples; + U32 totalSamplesSize; +} RANDOM_ctx_t; + +/** + * A segment is an inclusive range in the source. 
+ */ +typedef struct { + U32 begin; + U32 end; +} RANDOM_segment_t; + + +typedef struct { + unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+]; Default to 200 */ + ZDICT_params_t zParams; +} ZDICT_random_params_t; + + +typedef struct { + U64 totalSizeToLoad; + unsigned oneSampleTooLarge; + unsigned nbSamples; +} fileStats; + + +ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( + void *dictBuffer, size_t dictBufferCapacity, + const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, + ZDICT_random_params_t parameters); + + +int RANDOM_trainFromFiles(const char* dictFileName, unsigned maxDictSize, + const char** fileNamesTable, unsigned nbFiles, + size_t chunkSize, ZDICT_random_params_t *params); diff --git a/contrib/randomDictBuilder/test.sh b/contrib/randomDictBuilder/test.sh new file mode 100644 index 00000000..552650ee --- /dev/null +++ b/contrib/randomDictBuilder/test.sh @@ -0,0 +1,14 @@ +echo "Building random dictionary with c=5 in=../../lib/common k=200 out=dict1" +./main c=5 in=../../lib/common k=200 out=dict1 +zstd -be3 -D dict1 -r ../../lib/common -q +echo "Building random dictionary with c=9 in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000" +./main c=9 in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000 +zstd -be3 -D dict2 -r ../../lib/common -q +echo "Building random dictionary with 2 sample sources" +./main in=../../lib/common in=../../lib/compress out=dict3 +zstd -be3 -D dict3 -r ../../lib/common -q +echo "Removing dict1 dict2 dict3" +rm -f dict1 dict2 dict3 + +echo "Testing with invalid parameters, should fail" +! ./main r=10