diff --git a/.travis.yml b/.travis.yml index b7870b2d..d41a6774 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,7 +28,17 @@ matrix: - env: Cmd='make clang38install && CC=clang-3.8 make clean tsan-test-zstream' - env: Cmd='make arminstall && make armfuzz' +<<<<<<< HEAD - env: Cmd='make arminstall && make aarch64fuzz' +======= + +# Following test is disabled, as there is a bug in Travis' ld +# preventing aarch64 compilation to complete. +# > collect2: error: ld terminated with signal 11 [Segmentation fault], core dumped +# to be re-enabled in a few commit, as it's possible that a random code change circumvent the ld bug +# - env: Cmd='make arminstall && make aarch64fuzz' + +>>>>>>> dev - env: Cmd='make ppcinstall && make ppcfuzz' - env: Cmd='make ppcinstall && make ppc64fuzz' diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile b/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile new file mode 100644 index 00000000..68149488 --- /dev/null +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile @@ -0,0 +1,48 @@ +ARG := + +CC ?= gcc +CFLAGS ?= -O3 +INCLUDES := -I ../randomDictBuilder -I ../fastCover -I ../../../programs -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder + +RANDOM_FILE := ../randomDictBuilder/random.c +FAST_FILE := ../fastCover/fastCover.c +IO_FILE := ../randomDictBuilder/io.c + +all: run clean + +.PHONY: run +run: benchmark + echo "Benchmarking with $(ARG)" + ./benchmark $(ARG) + +.PHONY: test +test: benchmarkTest clean + +.PHONY: benchmarkTest +benchmarkTest: benchmark test.sh + sh test.sh + +benchmark: benchmark.o io.o random.o fastCover.o libzstd.a + $(CC) $(CFLAGS) benchmark.o io.o random.o fastCover.o libzstd.a -o benchmark + +benchmark.o: benchmark.c + $(CC) $(CFLAGS) $(INCLUDES) -c benchmark.c + +random.o: $(RANDOM_FILE) + $(CC) $(CFLAGS) $(INCLUDES) -c $(RANDOM_FILE) + +fastCover.o: $(FAST_FILE) + $(CC) $(CFLAGS) $(INCLUDES) -c $(FAST_FILE) + +io.o: $(IO_FILE) + $(CC) $(CFLAGS) $(INCLUDES) -c $(IO_FILE) + +libzstd.a: + $(MAKE) -C ../../../lib libzstd.a + mv ../../../lib/libzstd.a . + +.PHONY: clean +clean: + rm -f *.o benchmark libzstd.a + $(MAKE) -C ../../../lib clean + echo "Cleaning is completed" diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md new file mode 100644 index 00000000..559776e2 --- /dev/null +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md @@ -0,0 +1,126 @@ +Benchmarking Dictionary Builder + +### Permitted Argument: +Input File/Directory (in=fileName): required; file/directory used to build dictionary; if directory, will operate recursively for files inside directory; can include multiple files/directories, each following "in=" + +###Running Test: +make test + +###Usage: +Benchmark given input files: make ARG= followed by permitted arguments + +### Examples: +make ARG="in=../../../lib/dictBuilder in=../../../lib/compress" + +###Benchmarking Result: +- First Cover is optimize cover, second Cover uses optimized d and k from first one. +- For every f value of fastCover, the first one is optimize fastCover and the second one uses optimized d and k from first one. +- Fourth column is chosen d and fifth column is chosen k + +github: +NODICT 0.000025 2.999642 +RANDOM 0.030101 8.791189 +LEGACY 0.913108 8.173529 +COVER 59.234160 10.652243 8 1298 +COVER 6.258459 10.652243 8 1298 +FAST15 9.959246 10.555630 8 1874 +FAST15 0.077719 10.555630 8 1874 +FAST16 10.028343 10.701698 8 1106 +FAST16 0.078117 10.701698 8 1106 +FAST17 10.567355 10.650652 8 1106 +FAST17 0.124833 10.650652 8 1106 +FAST18 11.795287 10.499142 8 1826 +FAST18 0.086992 10.499142 8 1826 +FAST19 13.132451 10.527140 8 1826 +FAST19 0.134716 10.527140 8 1826 +FAST20 14.366314 10.494710 8 1826 +FAST20 0.128844 10.494710 8 1826 +FAST21 14.941238 10.503488 8 1778 +FAST21 0.134975 10.503488 8 1778 +FAST22 15.146226 10.509284 8 1826 +FAST22 0.146918 10.509284 8 1826 +FAST23 16.260552 10.509284 8 1826 +FAST23 0.158494 10.509284 8 1826 +FAST24 16.806037 10.512369 8 1826 +FAST24 0.190464 10.512369 8 1826 + +hg-commands: +NODICT 0.000026 2.425291 +RANDOM 0.046270 3.490331 +LEGACY 0.847904 3.911682 +COVER 71.691804 4.132653 8 386 +COVER 3.187085 4.132653 8 386 +FAST15 11.593687 3.920720 6 1106 +FAST15 0.082431 3.920720 6 1106 +FAST16 11.775958 4.033306 8 674 +FAST16 0.092587 4.033306 8 674 +FAST17 11.965064 4.064132 8 1490 +FAST17 0.106382 4.064132 8 1490 +FAST18 11.438197 4.086714 8 290 +FAST18 0.097293 4.086714 8 290 +FAST19 12.292512 4.097947 8 578 +FAST19 0.104406 4.097947 8 578 +FAST20 13.857857 4.102851 8 434 +FAST20 0.139467 4.102851 8 434 +FAST21 14.599613 4.105350 8 530 +FAST21 0.189416 4.105350 8 530 +FAST22 15.966109 4.104100 8 530 +FAST22 0.183817 4.104100 8 530 +FAST23 18.033645 4.098110 8 914 +FAST23 0.246641 4.098110 8 914 +FAST24 22.992891 4.117367 8 722 +FAST24 0.285994 4.117367 8 722 + +hg-changelog: +NODICT 0.000007 1.377613 +RANDOM 0.297345 2.097487 +LEGACY 2.633992 2.058907 +COVER 219.179786 2.189685 8 98 +COVER 6.620852 2.189685 8 98 +FAST15 47.635082 2.130794 6 386 +FAST15 0.321297 2.130794 6 386 +FAST16 43.837676 2.144845 8 194 +FAST16 0.312640 2.144845 8 194 +FAST17 49.349017 2.156099 8 242 +FAST17 0.348459 2.156099 8 242 +FAST18 51.153784 2.172439 6 98 +FAST18 0.353106 2.172439 6 98 +FAST19 52.627045 2.180321 6 98 +FAST19 0.390612 2.180321 6 98 +FAST20 63.748782 2.187431 6 98 +FAST20 0.489544 2.187431 6 98 +FAST21 68.709198 2.184185 6 146 +FAST21 0.530852 2.184185 6 146 +FAST22 68.491639 2.182830 6 98 +FAST22 0.645699 2.182830 6 98 +FAST23 72.558688 2.186399 8 98 +FAST23 0.593539 2.186399 8 98 +FAST24 76.137195 2.185608 6 98 +FAST24 0.680132 2.185608 6 98 + +hg-manifest: +NODICT 0.000026 1.866385 +RANDOM 0.784554 2.309436 +LEGACY 10.193714 2.506977 +COVER 988.206583 2.582528 8 434 +COVER 39.726199 2.582528 8 434 +FAST15 168.388819 2.392920 6 1826 +FAST15 1.272178 2.392920 6 1826 +FAST16 161.822607 2.480762 6 1922 +FAST16 1.164908 2.480762 6 1922 +FAST17 157.688544 2.548285 6 1682 +FAST17 1.222439 2.548285 6 1682 +FAST18 154.529585 2.567634 6 386 +FAST18 1.217596 2.567634 6 386 +FAST19 160.244979 2.581653 8 338 +FAST19 1.282450 2.581653 8 338 +FAST20 191.503297 2.586881 8 194 +FAST20 2.009748 2.586881 8 194 +FAST21 226.389709 2.590051 6 242 +FAST21 2.494543 2.590051 6 242 +FAST22 217.859055 2.591376 6 194 +FAST22 2.295693 2.591376 6 194 +FAST23 236.819791 2.591131 8 434 +FAST23 2.744711 2.591131 8 434 +FAST24 269.187800 2.591548 6 290 +FAST24 2.923671 2.591548 6 290 diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c new file mode 100644 index 00000000..d92e8d5c --- /dev/null +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c @@ -0,0 +1,433 @@ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* strcmp, strlen */ +#include /* errno */ +#include +#include +#include "random.h" +#include "fastCover.h" +#include "dictBuilder.h" +#include "zstd_internal.h" /* includes zstd.h */ +#include "io.h" +#include "util.h" +#include "zdict.h" + + + +/*-************************************* +* Console display +***************************************/ +#define DISPLAY(...) fprintf(stderr, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } + +static const U64 g_refreshRate = SEC_TO_MICRO / 6; +static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; + +#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \ + if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \ + { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ + if (displayLevel>=4) fflush(stderr); } } } + + +/*-************************************* +* Exceptions +***************************************/ +#ifndef DEBUG +# define DEBUG 0 +#endif +#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); +#define EXM_THROW(error, ...) \ +{ \ + DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ + DISPLAY("Error %i : ", error); \ + DISPLAY(__VA_ARGS__); \ + DISPLAY("\n"); \ + exit(error); \ +} + + +/*-************************************* +* Constants +***************************************/ +static const unsigned g_defaultMaxDictSize = 110 KB; +#define DEFAULT_CLEVEL 3 +#define DEFAULT_DISPLAYLEVEL 2 + + +/*-************************************* +* Struct +***************************************/ +typedef struct { + const void* dictBuffer; + size_t dictSize; +} dictInfo; + + +/*-************************************* +* Dictionary related operations +***************************************/ +/** createDictFromFiles() : + * Based on type of param given, train dictionary using the corresponding algorithm + * @return dictInfo containing dictionary buffer and dictionary size + */ +dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize, + ZDICT_random_params_t *randomParams, ZDICT_cover_params_t *coverParams, + ZDICT_legacy_params_t *legacyParams, ZDICT_fastCover_params_t *fastParams) { + unsigned const displayLevel = randomParams ? randomParams->zParams.notificationLevel : + coverParams ? coverParams->zParams.notificationLevel : + legacyParams ? legacyParams->zParams.notificationLevel : + fastParams ? fastParams->zParams.notificationLevel : + DEFAULT_DISPLAYLEVEL; /* no dict */ + void* const dictBuffer = malloc(maxDictSize); + + dictInfo* dInfo = NULL; + + /* Checks */ + if (!dictBuffer) + EXM_THROW(12, "not enough memory for trainFromFiles"); /* should not happen */ + + { size_t dictSize; + if(randomParams) { + dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, *randomParams); + }else if(coverParams) { + /* Run the optimize version if either k or d is not provided */ + if (!coverParams->d || !coverParams->k){ + dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, coverParams); + } else { + dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, *coverParams); + } + } else if(legacyParams) { + dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, *legacyParams); + } else if(fastParams) { + /* Run the optimize version if either k or d is not provided */ + if (!fastParams->d || !fastParams->k) { + dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, fastParams); + } else { + dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, *fastParams); + } + } else { + dictSize = 0; + } + if (ZDICT_isError(dictSize)) { + DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ + free(dictBuffer); + return dInfo; + } + dInfo = (dictInfo *)malloc(sizeof(dictInfo)); + dInfo->dictBuffer = dictBuffer; + dInfo->dictSize = dictSize; + } + return dInfo; +} + + +/** compressWithDict() : + * Compress samples from sample buffer given dicionary stored on dictionary buffer and compression level + * @return compression ratio + */ +double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLevel, int displayLevel) { + /* Local variables */ + size_t totalCompressedSize = 0; + size_t totalOriginalSize = 0; + const unsigned hasDict = dInfo->dictSize > 0 ? 1 : 0; + double cRatio; + size_t dstCapacity; + int i; + + /* Pointers */ + ZSTD_CDict *cdict = NULL; + ZSTD_CCtx* cctx = NULL; + size_t *offsets = NULL; + void* dst = NULL; + + /* Allocate dst with enough space to compress the maximum sized sample */ + { + size_t maxSampleSize = 0; + for (int i = 0; i < srcInfo->nbSamples; i++) { + maxSampleSize = MAX(srcInfo->samplesSizes[i], maxSampleSize); + } + dstCapacity = ZSTD_compressBound(maxSampleSize); + dst = malloc(dstCapacity); + } + + /* Calculate offset for each sample */ + offsets = (size_t *)malloc((srcInfo->nbSamples + 1) * sizeof(size_t)); + offsets[0] = 0; + for (i = 1; i <= srcInfo->nbSamples; i++) { + offsets[i] = offsets[i - 1] + srcInfo->samplesSizes[i - 1]; + } + + /* Create the cctx */ + cctx = ZSTD_createCCtx(); + if(!cctx || !dst) { + cRatio = -1; + goto _cleanup; + } + + /* Create CDict if there's a dictionary stored on buffer */ + if (hasDict) { + cdict = ZSTD_createCDict(dInfo->dictBuffer, dInfo->dictSize, compressionLevel); + if(!cdict) { + cRatio = -1; + goto _cleanup; + } + } + + /* Compress each sample and sum their sizes*/ + const BYTE *const samples = (const BYTE *)srcInfo->srcBuffer; + for (i = 0; i < srcInfo->nbSamples; i++) { + size_t compressedSize; + if(hasDict) { + compressedSize = ZSTD_compress_usingCDict(cctx, dst, dstCapacity, samples + offsets[i], srcInfo->samplesSizes[i], cdict); + } else { + compressedSize = ZSTD_compressCCtx(cctx, dst, dstCapacity,samples + offsets[i], srcInfo->samplesSizes[i], compressionLevel); + } + if (ZSTD_isError(compressedSize)) { + cRatio = -1; + goto _cleanup; + } + totalCompressedSize += compressedSize; + } + + /* Sum orignal sizes */ + for (i = 0; inbSamples; i++) { + totalOriginalSize += srcInfo->samplesSizes[i]; + } + + /* Calculate compression ratio */ + DISPLAYLEVEL(2, "original size is %lu\n", totalOriginalSize); + DISPLAYLEVEL(2, "compressed size is %lu\n", totalCompressedSize); + cRatio = (double)totalOriginalSize/(double)totalCompressedSize; + +_cleanup: + free(dst); + free(offsets); + ZSTD_freeCCtx(cctx); + ZSTD_freeCDict(cdict); + return cRatio; +} + + +/** FreeDictInfo() : + * Free memory allocated for dictInfo + */ +void freeDictInfo(dictInfo* info) { + if (!info) return; + if (info->dictBuffer) free((void*)(info->dictBuffer)); + free(info); +} + + + +/*-******************************************************** + * Benchmarking functions +**********************************************************/ +/** benchmarkDictBuilder() : + * Measure how long a dictionary builder takes and compression ratio with the dictionary built + * @return 0 if benchmark successfully, 1 otherwise + */ +int benchmarkDictBuilder(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random_params_t *randomParam, + ZDICT_cover_params_t *coverParam, ZDICT_legacy_params_t *legacyParam, + ZDICT_fastCover_params_t *fastParam) { + /* Local variables */ + const unsigned displayLevel = randomParam ? randomParam->zParams.notificationLevel : + coverParam ? coverParam->zParams.notificationLevel : + legacyParam ? legacyParam->zParams.notificationLevel : + fastParam ? fastParam->zParams.notificationLevel: + DEFAULT_DISPLAYLEVEL; /* no dict */ + const char* name = randomParam ? "RANDOM" : + coverParam ? "COVER" : + legacyParam ? "LEGACY" : + fastParam ? "FAST": + "NODICT"; /* no dict */ + const unsigned cLevel = randomParam ? randomParam->zParams.compressionLevel : + coverParam ? coverParam->zParams.compressionLevel : + legacyParam ? legacyParam->zParams.compressionLevel : + fastParam ? fastParam->zParams.compressionLevel: + DEFAULT_CLEVEL; /* no dict */ + int result = 0; + + /* Calculate speed */ + const UTIL_time_t begin = UTIL_getTime(); + dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, randomParam, coverParam, legacyParam, fastParam); + const U64 timeMicro = UTIL_clockSpanMicro(begin); + const double timeSec = timeMicro / (double)SEC_TO_MICRO; + if (!dInfo) { + DISPLAYLEVEL(1, "%s does not train successfully\n", name); + result = 1; + goto _cleanup; + } + DISPLAYLEVEL(1, "%s took %f seconds to execute \n", name, timeSec); + + /* Calculate compression ratio */ + const double cRatio = compressWithDict(srcInfo, dInfo, cLevel, displayLevel); + if (cRatio < 0) { + DISPLAYLEVEL(1, "Compressing with %s dictionary does not work\n", name); + result = 1; + goto _cleanup; + + } + DISPLAYLEVEL(1, "Compression ratio with %s dictionary is %f\n", name, cRatio); + +_cleanup: + freeDictInfo(dInfo); + return result; +} + + + +int main(int argCount, const char* argv[]) +{ + const int displayLevel = DEFAULT_DISPLAYLEVEL; + const char* programName = argv[0]; + int result = 0; + + /* Initialize arguments to default values */ + unsigned k = 200; + unsigned d = 8; + const unsigned cLevel = DEFAULT_CLEVEL; + const unsigned dictID = 0; + const unsigned maxDictSize = g_defaultMaxDictSize; + + /* Initialize table to store input files */ + const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); + unsigned filenameIdx = 0; + + char* fileNamesBuf = NULL; + unsigned fileNamesNb = filenameIdx; + const int followLinks = 0; + const char** extendedFileList = NULL; + + /* Parse arguments */ + for (int i = 1; i < argCount; i++) { + const char* argument = argv[i]; + if (longCommandWArg(&argument, "in=")) { + filenameTable[filenameIdx] = argument; + filenameIdx++; + continue; + } + DISPLAYLEVEL(1, "benchmark: Incorrect parameters\n"); + return 1; + } + + /* Get the list of all files recursively (because followLinks==0)*/ + extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, + &fileNamesNb, followLinks); + if (extendedFileList) { + unsigned u; + for (u=0; u /* fprintf */ +#include /* malloc, free, qsort */ +#include /* memset */ +#include /* clock */ +#include "mem.h" /* read */ +#include "pool.h" +#include "threading.h" +#include "fastCover.h" +#include "zstd_internal.h" /* includes zstd.h */ +#include "zdict.h" + + +/*-************************************* +* Constants +***************************************/ +#define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) +#define FASTCOVER_MAX_F 32 +#define DEFAULT_SPLITPOINT 1.0 + +/*-************************************* +* Console display +***************************************/ +static int g_displayLevel = 2; +#define DISPLAY(...) \ + { \ + fprintf(stderr, __VA_ARGS__); \ + fflush(stderr); \ + } +#define LOCALDISPLAYLEVEL(displayLevel, l, ...) \ + if (displayLevel >= l) { \ + DISPLAY(__VA_ARGS__); \ + } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */ +#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__) + +#define LOCALDISPLAYUPDATE(displayLevel, l, ...) \ + if (displayLevel >= l) { \ + if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) { \ + g_time = clock(); \ + DISPLAY(__VA_ARGS__); \ + } \ + } +#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__) +static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100; +static clock_t g_time = 0; + + +/*-************************************* +* Hash Functions +***************************************/ +static const U64 prime6bytes = 227718039650203ULL; +static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } +static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } + +static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; +static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } +static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } + + +/** + * Hash the d-byte value pointed to by p and mod 2^f + */ +static size_t FASTCOVER_hashPtrToIndex(const void* p, U32 h, unsigned d) { + if (d == 6) { + return ZSTD_hash6Ptr(p, h) & ((1 << h) - 1); + } + return ZSTD_hash8Ptr(p, h) & ((1 << h) - 1); +} + + +/*-************************************* +* Context +***************************************/ +typedef struct { + const BYTE *samples; + size_t *offsets; + const size_t *samplesSizes; + size_t nbSamples; + size_t nbTrainSamples; + size_t nbTestSamples; + size_t nbDmers; + U32 *freqs; + U16 *segmentFreqs; + unsigned d; +} FASTCOVER_ctx_t; + + +/*-************************************* +* Helper functions +***************************************/ +/** + * Returns the sum of the sample sizes. + */ +static size_t FASTCOVER_sum(const size_t *samplesSizes, unsigned nbSamples) { + size_t sum = 0; + unsigned i; + for (i = 0; i < nbSamples; ++i) { + sum += samplesSizes[i]; + } + return sum; +} + + +/*-************************************* +* fast functions +***************************************/ +/** + * A segment is a range in the source as well as the score of the segment. + */ +typedef struct { + U32 begin; + U32 end; + U32 score; +} FASTCOVER_segment_t; + + +/** + * Selects the best segment in an epoch. + * Segments of are scored according to the function: + * + * Let F(d) be the frequency of all dmers with hash value d. + * Let S_i be hash value of the dmer at position i of segment S which has length k. + * + * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1}) + * + * Once the dmer with hash value d is in the dictionay we set F(d) = F(d)/2. + */ +static FASTCOVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx, + U32 *freqs, U32 begin,U32 end, + ZDICT_fastCover_params_t parameters) { + /* Constants */ + const U32 k = parameters.k; + const U32 d = parameters.d; + const U32 dmersInK = k - d + 1; + /* Try each segment (activeSegment) and save the best (bestSegment) */ + FASTCOVER_segment_t bestSegment = {0, 0, 0}; + FASTCOVER_segment_t activeSegment; + /* Reset the activeDmers in the segment */ + /* The activeSegment starts at the beginning of the epoch. */ + activeSegment.begin = begin; + activeSegment.end = begin; + activeSegment.score = 0; + { + /* Slide the activeSegment through the whole epoch. + * Save the best segment in bestSegment. + */ + while (activeSegment.end < end) { + /* Get hash value of current dmer */ + const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, parameters.f, ctx->d); + /* Add frequency of this index to score if this is the first occurence of index in active segment */ + if (ctx->segmentFreqs[index] == 0) { + activeSegment.score += freqs[index]; + } + ctx->segmentFreqs[index] += 1; + /* Increment end of segment */ + activeSegment.end += 1; + /* If the window is now too large, drop the first position */ + if (activeSegment.end - activeSegment.begin == dmersInK + 1) { + /* Get hash value of the dmer to be eliminated from active segment */ + const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, parameters.f, ctx->d); + ctx->segmentFreqs[delIndex] -= 1; + /* Subtract frequency of this index from score if this is the last occurrence of this index in active segment */ + if (ctx->segmentFreqs[delIndex] == 0) { + activeSegment.score -= freqs[delIndex]; + } + /* Increment start of segment */ + activeSegment.begin += 1; + } + /* If this segment is the best so far save it */ + if (activeSegment.score > bestSegment.score) { + bestSegment = activeSegment; + } + } + /* Zero out rest of segmentFreqs array */ + while (activeSegment.begin < end) { + const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, parameters.f, ctx->d); + ctx->segmentFreqs[delIndex] -= 1; + activeSegment.begin += 1; + } + } + { + /* Trim off the zero frequency head and tail from the segment. */ + U32 newBegin = bestSegment.end; + U32 newEnd = bestSegment.begin; + U32 pos; + for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) { + const size_t index = FASTCOVER_hashPtrToIndex(ctx->samples + pos, parameters.f, ctx->d); + U32 freq = freqs[index]; + if (freq != 0) { + newBegin = MIN(newBegin, pos); + newEnd = pos + 1; + } + } + bestSegment.begin = newBegin; + bestSegment.end = newEnd; + } + { + /* Half the frequency of hash value of each dmer covered by the chosen segment. */ + U32 pos; + for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) { + const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, parameters.f, ctx->d); + freqs[i] = 0; + } + } + return bestSegment; +} + +/** + * Check the validity of the parameters. + * Returns non-zero if the parameters are valid and 0 otherwise. + */ +static int FASTCOVER_checkParameters(ZDICT_fastCover_params_t parameters, + size_t maxDictSize) { + /* k, d, and f are required parameters */ + if (parameters.d == 0 || parameters.k == 0 || parameters.f == 0) { + return 0; + } + /* d has to be 6 or 8 */ + if (parameters.d != 6 && parameters.d != 8) { + return 0; + } + /* 0 < f <= FASTCOVER_MAX_F */ + if (parameters.f > FASTCOVER_MAX_F) { + return 0; + } + /* k <= maxDictSize */ + if (parameters.k > maxDictSize) { + return 0; + } + /* d <= k */ + if (parameters.d > parameters.k) { + return 0; + } + /* 0 < splitPoint <= 1 */ + if (parameters.splitPoint <= 0 || parameters.splitPoint > 1) { + return 0; + } + return 1; +} + + +/** + * Clean up a context initialized with `FASTCOVER_ctx_init()`. + */ +static void FASTCOVER_ctx_destroy(FASTCOVER_ctx_t *ctx) { + if (!ctx) { + return; + } + if (ctx->segmentFreqs) { + free(ctx->segmentFreqs); + ctx->segmentFreqs = NULL; + } + if (ctx->freqs) { + free(ctx->freqs); + ctx->freqs = NULL; + } + if (ctx->offsets) { + free(ctx->offsets); + ctx->offsets = NULL; + } +} + +/** + * Calculate for frequency of hash value of each dmer in ctx->samples + */ +static void FASTCOVER_computeFrequency(U32 *freqs, unsigned f, FASTCOVER_ctx_t *ctx){ + size_t start; /* start of current dmer */ + for (unsigned i = 0; i < ctx->nbTrainSamples; i++) { + size_t currSampleStart = ctx->offsets[i]; + size_t currSampleEnd = ctx->offsets[i+1]; + start = currSampleStart; + while (start + ctx->d <= currSampleEnd) { + const size_t dmerIndex = FASTCOVER_hashPtrToIndex(ctx->samples + start, f, ctx->d); + freqs[dmerIndex]++; + start++; + } + } +} + +/** + * Prepare a context for dictionary building. + * The context is only dependent on the parameter `d` and can used multiple + * times. + * Returns 1 on success or zero on error. + * The context must be destroyed with `FASTCOVER_ctx_destroy()`. + */ +static int FASTCOVER_ctx_init(FASTCOVER_ctx_t *ctx, const void *samplesBuffer, + const size_t *samplesSizes, unsigned nbSamples, + unsigned d, double splitPoint, unsigned f) { + const BYTE *const samples = (const BYTE *)samplesBuffer; + const size_t totalSamplesSize = FASTCOVER_sum(samplesSizes, nbSamples); + /* Split samples into testing and training sets */ + const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples; + const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples; + const size_t trainingSamplesSize = splitPoint < 1.0 ? FASTCOVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize; + const size_t testSamplesSize = splitPoint < 1.0 ? FASTCOVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize; + /* Checks */ + if (totalSamplesSize < MAX(d, sizeof(U64)) || + totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) { + DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n", + (U32)(totalSamplesSize>>20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20)); + return 0; + } + /* Check if there are at least 5 training samples */ + if (nbTrainSamples < 5) { + DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples); + return 0; + } + /* Check if there's testing sample */ + if (nbTestSamples < 1) { + DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples); + return 0; + } + /* Zero the context */ + memset(ctx, 0, sizeof(*ctx)); + DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples, + (U32)trainingSamplesSize); + DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples, + (U32)testSamplesSize); + + ctx->samples = samples; + ctx->samplesSizes = samplesSizes; + ctx->nbSamples = nbSamples; + ctx->nbTrainSamples = nbTrainSamples; + ctx->nbTestSamples = nbTestSamples; + ctx->nbDmers = trainingSamplesSize - d + 1; + ctx->d = d; + + /* The offsets of each file */ + ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t)); + if (!ctx->offsets) { + DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n"); + FASTCOVER_ctx_destroy(ctx); + return 0; + } + + /* Fill offsets from the samplesSizes */ + { + U32 i; + ctx->offsets[0] = 0; + for (i = 1; i <= nbSamples; ++i) { + ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1]; + } + } + + /* Initialize frequency array of size 2^f */ + ctx->freqs = (U32 *)calloc((1 << f), sizeof(U32)); + ctx->segmentFreqs = (U16 *)calloc((1 << f), sizeof(U16)); + DISPLAYLEVEL(2, "Computing frequencies\n"); + FASTCOVER_computeFrequency(ctx->freqs, f, ctx); + + return 1; +} + + +/** + * Given the prepared context build the dictionary. + */ +static size_t FASTCOVER_buildDictionary(const FASTCOVER_ctx_t *ctx, U32 *freqs, + void *dictBuffer, + size_t dictBufferCapacity, + ZDICT_fastCover_params_t parameters){ + BYTE *const dict = (BYTE *)dictBuffer; + size_t tail = dictBufferCapacity; + /* Divide the data up into epochs of equal size. + * We will select at least one segment from each epoch. + */ + const U32 epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k)); + const U32 epochSize = (U32)(ctx->nbDmers / epochs); + size_t epoch; + DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n", epochs, + epochSize); + /* Loop through the epochs until there are no more segments or the dictionary + * is full. + */ + for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) { + const U32 epochBegin = (U32)(epoch * epochSize); + const U32 epochEnd = epochBegin + epochSize; + size_t segmentSize; + /* Select a segment */ + FASTCOVER_segment_t segment = FASTCOVER_selectSegment( + ctx, freqs, epochBegin, epochEnd, parameters); + + /* If the segment covers no dmers, then we are out of content */ + if (segment.score == 0) { + break; + } + + /* Trim the segment if necessary and if it is too small then we are done */ + segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail); + if (segmentSize < parameters.d) { + break; + } + + /* We fill the dictionary from the back to allow the best segments to be + * referenced with the smallest offsets. + */ + tail -= segmentSize; + memcpy(dict + tail, ctx->samples + segment.begin, segmentSize); + DISPLAYUPDATE( + 2, "\r%u%% ", + (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity)); + } + DISPLAYLEVEL(2, "\r%79s\r", ""); + return tail; +} + + +/** + * FASTCOVER_best_t is used for two purposes: + * 1. Synchronizing threads. + * 2. Saving the best parameters and dictionary. + * + * All of the methods except FASTCOVER_best_init() are thread safe if zstd is + * compiled with multithreaded support. + */ +typedef struct fast_best_s { + ZSTD_pthread_mutex_t mutex; + ZSTD_pthread_cond_t cond; + size_t liveJobs; + void *dict; + size_t dictSize; + ZDICT_fastCover_params_t parameters; + size_t compressedSize; +} FASTCOVER_best_t; + +/** + * Initialize the `FASTCOVER_best_t`. + */ +static void FASTCOVER_best_init(FASTCOVER_best_t *best) { + if (best==NULL) return; /* compatible with init on NULL */ + (void)ZSTD_pthread_mutex_init(&best->mutex, NULL); + (void)ZSTD_pthread_cond_init(&best->cond, NULL); + best->liveJobs = 0; + best->dict = NULL; + best->dictSize = 0; + best->compressedSize = (size_t)-1; + memset(&best->parameters, 0, sizeof(best->parameters)); +} + +/** + * Wait until liveJobs == 0. + */ +static void FASTCOVER_best_wait(FASTCOVER_best_t *best) { + if (!best) { + return; + } + ZSTD_pthread_mutex_lock(&best->mutex); + while (best->liveJobs != 0) { + ZSTD_pthread_cond_wait(&best->cond, &best->mutex); + } + ZSTD_pthread_mutex_unlock(&best->mutex); +} + +/** + * Call FASTCOVER_best_wait() and then destroy the FASTCOVER_best_t. + */ +static void FASTCOVER_best_destroy(FASTCOVER_best_t *best) { + if (!best) { + return; + } + FASTCOVER_best_wait(best); + if (best->dict) { + free(best->dict); + } + ZSTD_pthread_mutex_destroy(&best->mutex); + ZSTD_pthread_cond_destroy(&best->cond); +} + +/** + * Called when a thread is about to be launched. + * Increments liveJobs. + */ +static void FASTCOVER_best_start(FASTCOVER_best_t *best) { + if (!best) { + return; + } + ZSTD_pthread_mutex_lock(&best->mutex); + ++best->liveJobs; + ZSTD_pthread_mutex_unlock(&best->mutex); +} + +/** + * Called when a thread finishes executing, both on error or success. + * Decrements liveJobs and signals any waiting threads if liveJobs == 0. + * If this dictionary is the best so far save it and its parameters. + */ +static void FASTCOVER_best_finish(FASTCOVER_best_t *best, size_t compressedSize, + ZDICT_fastCover_params_t parameters, void *dict, + size_t dictSize) { + if (!best) { + return; + } + { + size_t liveJobs; + ZSTD_pthread_mutex_lock(&best->mutex); + --best->liveJobs; + liveJobs = best->liveJobs; + /* If the new dictionary is better */ + if (compressedSize < best->compressedSize) { + /* Allocate space if necessary */ + if (!best->dict || best->dictSize < dictSize) { + if (best->dict) { + free(best->dict); + } + best->dict = malloc(dictSize); + if (!best->dict) { + best->compressedSize = ERROR(GENERIC); + best->dictSize = 0; + return; + } + } + /* Save the dictionary, parameters, and size */ + memcpy(best->dict, dict, dictSize); + best->dictSize = dictSize; + best->parameters = parameters; + best->compressedSize = compressedSize; + } + ZSTD_pthread_mutex_unlock(&best->mutex); + if (liveJobs == 0) { + ZSTD_pthread_cond_broadcast(&best->cond); + } + } +} + +/** + * Parameters for FASTCOVER_tryParameters(). + */ +typedef struct FASTCOVER_tryParameters_data_s { + const FASTCOVER_ctx_t *ctx; + FASTCOVER_best_t *best; + size_t dictBufferCapacity; + ZDICT_fastCover_params_t parameters; +} FASTCOVER_tryParameters_data_t; + +/** + * Tries a set of parameters and updates the FASTCOVER_best_t with the results. + * This function is thread safe if zstd is compiled with multithreaded support. + * It takes its parameters as an *OWNING* opaque pointer to support threading. + */ +static void FASTCOVER_tryParameters(void *opaque) { + /* Save parameters as local variables */ + FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t *)opaque; + const FASTCOVER_ctx_t *const ctx = data->ctx; + const ZDICT_fastCover_params_t parameters = data->parameters; + size_t dictBufferCapacity = data->dictBufferCapacity; + size_t totalCompressedSize = ERROR(GENERIC); + /* Allocate space for hash table, dict, and freqs */ + BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity); + U32 *freqs = (U32*) malloc((1 << parameters.f) * sizeof(U32)); + if (!dict || !freqs) { + DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n"); + goto _cleanup; + } + /* Copy the frequencies because we need to modify them */ + memcpy(freqs, ctx->freqs, (1 << parameters.f) * sizeof(U32)); + /* Build the dictionary */ + { + const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, + dictBufferCapacity, parameters); + + dictBufferCapacity = ZDICT_finalizeDictionary( + dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, + ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, + parameters.zParams); + if (ZDICT_isError(dictBufferCapacity)) { + DISPLAYLEVEL(1, "Failed to finalize dictionary\n"); + goto _cleanup; + } + } + /* Check total compressed size */ + { + /* Pointers */ + ZSTD_CCtx *cctx; + ZSTD_CDict *cdict; + void *dst; + /* Local variables */ + size_t dstCapacity; + size_t i; + /* Allocate dst with enough space to compress the maximum sized sample */ + { + size_t maxSampleSize = 0; + i = parameters.splitPoint < 1.0 ? ctx->nbTrainSamples : 0; + for (; i < ctx->nbSamples; ++i) { + maxSampleSize = MAX(ctx->samplesSizes[i], maxSampleSize); + } + dstCapacity = ZSTD_compressBound(maxSampleSize); + dst = malloc(dstCapacity); + } + /* Create the cctx and cdict */ + cctx = ZSTD_createCCtx(); + cdict = ZSTD_createCDict(dict, dictBufferCapacity, + parameters.zParams.compressionLevel); + if (!dst || !cctx || !cdict) { + goto _compressCleanup; + } + /* Compress each sample and sum their sizes (or error) */ + totalCompressedSize = dictBufferCapacity; + i = parameters.splitPoint < 1.0 ? ctx->nbTrainSamples : 0; + for (; i < ctx->nbSamples; ++i) { + const size_t size = ZSTD_compress_usingCDict( + cctx, dst, dstCapacity, ctx->samples + ctx->offsets[i], + ctx->samplesSizes[i], cdict); + if (ZSTD_isError(size)) { + totalCompressedSize = ERROR(GENERIC); + goto _compressCleanup; + } + totalCompressedSize += size; + } + _compressCleanup: + ZSTD_freeCCtx(cctx); + ZSTD_freeCDict(cdict); + if (dst) { + free(dst); + } + } + +_cleanup: + FASTCOVER_best_finish(data->best, totalCompressedSize, parameters, dict, + dictBufferCapacity); + free(data); + if (dict) { + free(dict); + } + if (freqs) { + free(freqs); + } +} + +ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover( + void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, + const size_t *samplesSizes, unsigned nbSamples, ZDICT_fastCover_params_t parameters) { + BYTE* const dict = (BYTE*)dictBuffer; + FASTCOVER_ctx_t ctx; + parameters.splitPoint = 1.0; + /* Initialize global data */ + g_displayLevel = parameters.zParams.notificationLevel; + /* Checks */ + if (!FASTCOVER_checkParameters(parameters, dictBufferCapacity)) { + DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n"); + return ERROR(GENERIC); + } + if (nbSamples == 0) { + DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n"); + return ERROR(GENERIC); + } + if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { + DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n", + ZDICT_DICTSIZE_MIN); + return ERROR(dstSize_tooSmall); + } + /* Initialize context */ + if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, + parameters.d, parameters.splitPoint, parameters.f)) { + DISPLAYLEVEL(1, "Failed to initialize context\n"); + return ERROR(GENERIC); + } + /* Build the dictionary */ + DISPLAYLEVEL(2, "Building dictionary\n"); + { + const size_t tail = FASTCOVER_buildDictionary(&ctx, ctx.freqs, dictBuffer, + dictBufferCapacity, parameters); + + const size_t dictionarySize = ZDICT_finalizeDictionary( + dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, + samplesBuffer, samplesSizes, (unsigned)ctx.nbTrainSamples, + parameters.zParams); + if (!ZSTD_isError(dictionarySize)) { + DISPLAYLEVEL(2, "Constructed dictionary of size %u\n", + (U32)dictionarySize); + } + FASTCOVER_ctx_destroy(&ctx); + return dictionarySize; + } +} + + + +ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover( + void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, + const size_t *samplesSizes, unsigned nbSamples, + ZDICT_fastCover_params_t *parameters) { + /* constants */ + const unsigned nbThreads = parameters->nbThreads; + const double splitPoint = + parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint; + const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d; + const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d; + const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k; + const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k; + const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps; + const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1); + const unsigned kIterations = + (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize); + const unsigned f = parameters->f == 0 ? 23 : parameters->f; + + /* Local variables */ + const int displayLevel = parameters->zParams.notificationLevel; + unsigned iteration = 1; + unsigned d; + unsigned k; + FASTCOVER_best_t best; + POOL_ctx *pool = NULL; + + /* Checks */ + if (splitPoint <= 0 || splitPoint > 1) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n"); + return ERROR(GENERIC); + } + if (kMinK < kMaxD || kMaxK < kMinK) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n"); + return ERROR(GENERIC); + } + if (nbSamples == 0) { + DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n"); + return ERROR(GENERIC); + } + if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { + DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n", + ZDICT_DICTSIZE_MIN); + return ERROR(dstSize_tooSmall); + } + if (nbThreads > 1) { + pool = POOL_create(nbThreads, 1); + if (!pool) { + return ERROR(memory_allocation); + } + } + /* Initialization */ + FASTCOVER_best_init(&best); + /* Turn down global display level to clean up display at level 2 and below */ + g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1; + /* Loop through d first because each new value needs a new context */ + LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n", + kIterations); + for (d = kMinD; d <= kMaxD; d += 2) { + /* Initialize the context for this value of d */ + FASTCOVER_ctx_t ctx; + LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d); + if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f)) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n"); + FASTCOVER_best_destroy(&best); + POOL_free(pool); + return ERROR(GENERIC); + } + /* Loop through k reusing the same context */ + for (k = kMinK; k <= kMaxK; k += kStepSize) { + /* Prepare the arguments */ + FASTCOVER_tryParameters_data_t *data = (FASTCOVER_tryParameters_data_t *)malloc( + sizeof(FASTCOVER_tryParameters_data_t)); + LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k); + if (!data) { + LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n"); + FASTCOVER_best_destroy(&best); + FASTCOVER_ctx_destroy(&ctx); + POOL_free(pool); + return ERROR(GENERIC); + } + data->ctx = &ctx; + data->best = &best; + data->dictBufferCapacity = dictBufferCapacity; + data->parameters = *parameters; + data->parameters.k = k; + data->parameters.d = d; + data->parameters.f = f; + data->parameters.splitPoint = splitPoint; + data->parameters.steps = kSteps; + data->parameters.zParams.notificationLevel = g_displayLevel; + /* Check the parameters */ + if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity)) { + DISPLAYLEVEL(1, "fastCover parameters incorrect\n"); + free(data); + continue; + } + /* Call the function and pass ownership of data to it */ + FASTCOVER_best_start(&best); + if (pool) { + POOL_add(pool, &FASTCOVER_tryParameters, data); + } else { + FASTCOVER_tryParameters(data); + } + /* Print status */ + LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ", + (U32)((iteration * 100) / kIterations)); + ++iteration; + } + FASTCOVER_best_wait(&best); + FASTCOVER_ctx_destroy(&ctx); + } + LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", ""); + /* Fill the output buffer and parameters with output of the best parameters */ + { + const size_t dictSize = best.dictSize; + if (ZSTD_isError(best.compressedSize)) { + const size_t compressedSize = best.compressedSize; + FASTCOVER_best_destroy(&best); + POOL_free(pool); + return compressedSize; + } + *parameters = best.parameters; + memcpy(dictBuffer, best.dict, dictSize); + FASTCOVER_best_destroy(&best); + POOL_free(pool); + return dictSize; + } + +} diff --git a/contrib/experimental_dict_builders/fastCover/fastCover.h b/contrib/experimental_dict_builders/fastCover/fastCover.h new file mode 100644 index 00000000..958e9f42 --- /dev/null +++ b/contrib/experimental_dict_builders/fastCover/fastCover.h @@ -0,0 +1,57 @@ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* memset */ +#include /* clock */ +#include "mem.h" /* read */ +#include "pool.h" +#include "threading.h" +#include "zstd_internal.h" /* includes zstd.h */ +#ifndef ZDICT_STATIC_LINKING_ONLY +#define ZDICT_STATIC_LINKING_ONLY +#endif +#include "zdict.h" + + +typedef struct { + unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */ + unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */ + unsigned f; /* log of size of frequency array */ + unsigned steps; /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */ + unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */ + double splitPoint; /* Percentage of samples used for training: the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */ + ZDICT_params_t zParams; +} ZDICT_fastCover_params_t; + + +/*! ZDICT_optimizeTrainFromBuffer_fastCover(): + * Train a dictionary from an array of samples using a modified version of the COVER algorithm. + * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, + * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. + * The resulting dictionary will be saved into `dictBuffer`. + * All of the parameters except for f are optional. + * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}. + * if steps is zero it defaults to its default value. + * If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [16, 2048]. + * + * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) + * or an error code, which can be tested with ZDICT_isError(). + * On success `*parameters` contains the parameters selected. + */ + ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover( + void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, + const size_t *samplesSizes, unsigned nbSamples, + ZDICT_fastCover_params_t *parameters); + + +/*! ZDICT_trainFromBuffer_fastCover(): + * Train a dictionary from an array of samples using a modified version of the COVER algorithm. + * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, + * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. + * The resulting dictionary will be saved into `dictBuffer`. + * d, k, and f are required. + * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) + * or an error code, which can be tested with ZDICT_isError(). + */ +ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover( + void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, + const size_t *samplesSizes, unsigned nbSamples, ZDICT_fastCover_params_t parameters); diff --git a/contrib/experimental_dict_builders/fastCover/main.c b/contrib/experimental_dict_builders/fastCover/main.c new file mode 100644 index 00000000..df7d9181 --- /dev/null +++ b/contrib/experimental_dict_builders/fastCover/main.c @@ -0,0 +1,183 @@ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* strcmp, strlen */ +#include /* errno */ +#include +#include "fastCover.h" +#include "io.h" +#include "util.h" +#include "zdict.h" + + +/*-************************************* +* Console display +***************************************/ +#define DISPLAY(...) fprintf(stderr, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } + +static const U64 g_refreshRate = SEC_TO_MICRO / 6; +static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; + +#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \ + if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \ + { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ + if (displayLevel>=4) fflush(stderr); } } } + + +/*-************************************* +* Exceptions +***************************************/ +#ifndef DEBUG +# define DEBUG 0 +#endif +#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); +#define EXM_THROW(error, ...) \ +{ \ + DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ + DISPLAY("Error %i : ", error); \ + DISPLAY(__VA_ARGS__); \ + DISPLAY("\n"); \ + exit(error); \ +} + + +/*-************************************* +* Constants +***************************************/ +static const unsigned g_defaultMaxDictSize = 110 KB; +#define DEFAULT_CLEVEL 3 + + +/*-************************************* +* FASTCOVER +***************************************/ +int FASTCOVER_trainFromFiles(const char* dictFileName, sampleInfo *info, + unsigned maxDictSize, + ZDICT_fastCover_params_t *params) { + unsigned const displayLevel = params->zParams.notificationLevel; + void* const dictBuffer = malloc(maxDictSize); + + int result = 0; + + /* Checks */ + if (!dictBuffer) + EXM_THROW(12, "not enough memory for trainFromFiles"); /* should not happen */ + + { size_t dictSize; + /* Run the optimize version if either k or d is not provided */ + if (!params->d || !params->k) { + dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, params); + } else { + dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, *params); + } + DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", params->k, params->d, params->f, params->steps, (unsigned)(params->splitPoint*100)); + if (ZDICT_isError(dictSize)) { + DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ + result = 1; + goto _done; + } + /* save dict */ + DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); + saveDict(dictFileName, dictBuffer, dictSize); + } + + /* clean up */ +_done: + free(dictBuffer); + return result; +} + + + +int main(int argCount, const char* argv[]) +{ + int displayLevel = 2; + const char* programName = argv[0]; + int operationResult = 0; + + /* Initialize arguments to default values */ + unsigned k = 0; + unsigned d = 0; + unsigned f = 23; + unsigned steps = 32; + unsigned nbThreads = 1; + unsigned split = 100; + const char* outputFile = "fastCoverDict"; + unsigned dictID = 0; + unsigned maxDictSize = g_defaultMaxDictSize; + + /* Initialize table to store input files */ + const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); + unsigned filenameIdx = 0; + + char* fileNamesBuf = NULL; + unsigned fileNamesNb = filenameIdx; + int followLinks = 0; /* follow directory recursively */ + const char** extendedFileList = NULL; + + /* Parse arguments */ + for (int i = 1; i < argCount; i++) { + const char* argument = argv[i]; + if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "d=")) { d = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "f=")) { f = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "steps=")) { steps = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "split=")) { split = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "in=")) { + filenameTable[filenameIdx] = argument; + filenameIdx++; + continue; + } + if (longCommandWArg(&argument, "out=")) { + outputFile = argument; + continue; + } + DISPLAYLEVEL(1, "Incorrect parameters\n"); + operationResult = 1; + return operationResult; + } + + /* Get the list of all files recursively (because followLinks==0)*/ + extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, + &fileNamesNb, followLinks); + if (extendedFileList) { + unsigned u; + for (u=0; u /* fprintf */ +#include /* malloc, free, qsort */ +#include /* strcmp, strlen */ +#include /* errno */ +#include +#include "io.h" +#include "fileio.h" /* stdinmark, stdoutmark, ZSTD_EXTENSION */ +#include "platform.h" /* Large Files support */ +#include "util.h" +#include "zdict.h" + +/*-************************************* +* Console display +***************************************/ +#define DISPLAY(...) fprintf(stderr, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } + +static const U64 g_refreshRate = SEC_TO_MICRO / 6; +static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; + +#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \ + if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \ + { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ + if (displayLevel>=4) fflush(stderr); } } } + +/*-************************************* +* Exceptions +***************************************/ +#ifndef DEBUG +# define DEBUG 0 +#endif +#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); +#define EXM_THROW(error, ...) \ +{ \ + DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ + DISPLAY("Error %i : ", error); \ + DISPLAY(__VA_ARGS__); \ + DISPLAY("\n"); \ + exit(error); \ +} + + +/*-************************************* +* Constants +***************************************/ + +#define SAMPLESIZE_MAX (128 KB) +#define RANDOM_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((U32)-1) : ((U32)1 GB)) +#define RANDOM_MEMMULT 9 +static const size_t g_maxMemory = (sizeof(size_t) == 4) ? + (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); + +#define NOISELENGTH 32 + + +/*-************************************* +* Commandline related functions +***************************************/ +unsigned readU32FromChar(const char** stringPtr){ + const char errorMsg[] = "error: numeric value too large"; + unsigned result = 0; + while ((**stringPtr >='0') && (**stringPtr <='9')) { + unsigned const max = (((unsigned)(-1)) / 10) - 1; + if (result > max) exit(1); + result *= 10, result += **stringPtr - '0', (*stringPtr)++ ; + } + if ((**stringPtr=='K') || (**stringPtr=='M')) { + unsigned const maxK = ((unsigned)(-1)) >> 10; + if (result > maxK) exit(1); + result <<= 10; + if (**stringPtr=='M') { + if (result > maxK) exit(1); + result <<= 10; + } + (*stringPtr)++; /* skip `K` or `M` */ + if (**stringPtr=='i') (*stringPtr)++; + if (**stringPtr=='B') (*stringPtr)++; + } + return result; +} + +unsigned longCommandWArg(const char** stringPtr, const char* longCommand){ + size_t const comSize = strlen(longCommand); + int const result = !strncmp(*stringPtr, longCommand, comSize); + if (result) *stringPtr += comSize; + return result; +} + + +/* ******************************************************** +* File related operations +**********************************************************/ +/** loadFiles() : + * load samples from files listed in fileNamesTable into buffer. + * works even if buffer is too small to load all samples. + * Also provides the size of each sample into sampleSizes table + * which must be sized correctly, using DiB_fileStats(). + * @return : nb of samples effectively loaded into `buffer` + * *bufferSizePtr is modified, it provides the amount data loaded within buffer. + * sampleSizes is filled with the size of each sample. + */ +static unsigned loadFiles(void* buffer, size_t* bufferSizePtr, size_t* sampleSizes, + unsigned sstSize, const char** fileNamesTable, unsigned nbFiles, + size_t targetChunkSize, unsigned displayLevel) { + char* const buff = (char*)buffer; + size_t pos = 0; + unsigned nbLoadedChunks = 0, fileIndex; + + for (fileIndex=0; fileIndex *bufferSizePtr-pos) break; + { size_t const readSize = fread(buff+pos, 1, toLoad, f); + if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName); + pos += readSize; + sampleSizes[nbLoadedChunks++] = toLoad; + remainingToLoad -= targetChunkSize; + if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */ + fileIndex = nbFiles; /* stop there */ + break; + } + if (toLoad < targetChunkSize) { + fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR); + } } } + fclose(f); + } + DISPLAYLEVEL(2, "\r%79s\r", ""); + *bufferSizePtr = pos; + DISPLAYLEVEL(4, "loaded : %u KB \n", (U32)(pos >> 10)) + return nbLoadedChunks; +} + +#define rotl32(x,r) ((x << r) | (x >> (32 - r))) +static U32 getRand(U32* src) +{ + static const U32 prime1 = 2654435761U; + static const U32 prime2 = 2246822519U; + U32 rand32 = *src; + rand32 *= prime1; + rand32 ^= prime2; + rand32 = rotl32(rand32, 13); + *src = rand32; + return rand32 >> 5; +} + +/* shuffle() : + * shuffle a table of file names in a semi-random way + * It improves dictionary quality by reducing "locality" impact, so if sample set is very large, + * it will load random elements from it, instead of just the first ones. */ +static void shuffle(const char** fileNamesTable, unsigned nbFiles) { + U32 seed = 0xFD2FB528; + unsigned i; + for (i = nbFiles - 1; i > 0; --i) { + unsigned const j = getRand(&seed) % (i + 1); + const char* const tmp = fileNamesTable[j]; + fileNamesTable[j] = fileNamesTable[i]; + fileNamesTable[i] = tmp; + } +} + + +/*-******************************************************** +* Dictionary training functions +**********************************************************/ +size_t findMaxMem(unsigned long long requiredMem) { + size_t const step = 8 MB; + void* testmem = NULL; + + requiredMem = (((requiredMem >> 23) + 1) << 23); + requiredMem += step; + if (requiredMem > g_maxMemory) requiredMem = g_maxMemory; + + while (!testmem) { + testmem = malloc((size_t)requiredMem); + requiredMem -= step; + } + + free(testmem); + return (size_t)requiredMem; +} + +void saveDict(const char* dictFileName, + const void* buff, size_t buffSize) { + FILE* const f = fopen(dictFileName, "wb"); + if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName); + + { size_t const n = fwrite(buff, 1, buffSize, f); + if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) } + + { size_t const n = (size_t)fclose(f); + if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) } +} + +/*! getFileStats() : + * Given a list of files, and a chunkSize (0 == no chunk, whole files) + * provides the amount of data to be loaded and the resulting nb of samples. + * This is useful primarily for allocation purpose => sample buffer, and sample sizes table. + */ +static fileStats getFileStats(const char** fileNamesTable, unsigned nbFiles, + size_t chunkSize, unsigned displayLevel) { + fileStats fs; + unsigned n; + memset(&fs, 0, sizeof(fs)); + for (n=0; n 2*SAMPLESIZE_MAX); + fs.nbSamples += nbSamples; + } + DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (U32)(fs.totalSizeToLoad >> 10)); + return fs; +} + + + + +sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, + unsigned maxDictSize, const unsigned displayLevel) { + fileStats const fs = getFileStats(fileNamesTable, nbFiles, chunkSize, displayLevel); + size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t)); + size_t const memMult = RANDOM_MEMMULT; + size_t const maxMem = findMaxMem(fs.totalSizeToLoad * memMult) / memMult; + size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad); + void* const srcBuffer = malloc(loadedSize+NOISELENGTH); + + /* Checks */ + if ((!sampleSizes) || (!srcBuffer)) + EXM_THROW(12, "not enough memory for trainFromFiles"); /* should not happen */ + if (fs.oneSampleTooLarge) { + DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n"); + DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n"); + DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX); + } + if (fs.nbSamples < 5) { + DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n"); + DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n"); + DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n"); + EXM_THROW(14, "nb of samples too low"); /* we now clearly forbid this case */ + } + if (fs.totalSizeToLoad < (unsigned long long)(8 * maxDictSize)) { + DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n"); + DISPLAYLEVEL(2, "! Samples should be about 100x larger than target dictionary size \n"); + } + + /* init */ + if (loadedSize < fs.totalSizeToLoad) + DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20)); + + /* Load input buffer */ + DISPLAYLEVEL(3, "Shuffling input files\n"); + shuffle(fileNamesTable, nbFiles); + nbFiles = loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, + fileNamesTable, nbFiles, chunkSize, displayLevel); + + sampleInfo *info = (sampleInfo *)malloc(sizeof(sampleInfo)); + + info->nbSamples = fs.nbSamples; + info->samplesSizes = sampleSizes; + info->srcBuffer = srcBuffer; + + return info; +} + + +void freeSampleInfo(sampleInfo *info) { + if (!info) return; + if (info->samplesSizes) free((void*)(info->samplesSizes)); + if (info->srcBuffer) free((void*)(info->srcBuffer)); + free(info); +} diff --git a/contrib/experimental_dict_builders/randomDictBuilder/io.h b/contrib/experimental_dict_builders/randomDictBuilder/io.h new file mode 100644 index 00000000..0ee24604 --- /dev/null +++ b/contrib/experimental_dict_builders/randomDictBuilder/io.h @@ -0,0 +1,60 @@ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* strcmp, strlen */ +#include /* errno */ +#include +#include "zstd_internal.h" /* includes zstd.h */ +#include "fileio.h" /* stdinmark, stdoutmark, ZSTD_EXTENSION */ +#include "platform.h" /* Large Files support */ +#include "util.h" +#include "zdict.h" + + +/*-************************************* +* Structs +***************************************/ +typedef struct { + U64 totalSizeToLoad; + unsigned oneSampleTooLarge; + unsigned nbSamples; +} fileStats; + +typedef struct { + const void* srcBuffer; + const size_t *samplesSizes; + size_t nbSamples; +}sampleInfo; + + + +/*! getSampleInfo(): + * Load from input files and add samples to buffer + * @return: a sampleInfo struct containing infomation about buffer where samples are stored, + * size of each sample, and total number of samples + */ +sampleInfo* getSampleInfo(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, + unsigned maxDictSize, const unsigned displayLevel); + + + +/*! freeSampleInfo(): + * Free memory allocated for info + */ +void freeSampleInfo(sampleInfo *info); + + + +/*! saveDict(): + * Save data stored on buff to dictFileName + */ +void saveDict(const char* dictFileName, const void* buff, size_t buffSize); + + +unsigned readU32FromChar(const char** stringPtr); + +/** longCommandWArg() : + * check if *stringPtr is the same as longCommand. + * If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand. + * @return 0 and doesn't modify *stringPtr otherwise. + */ +unsigned longCommandWArg(const char** stringPtr, const char* longCommand); diff --git a/contrib/experimental_dict_builders/randomDictBuilder/main.c b/contrib/experimental_dict_builders/randomDictBuilder/main.c new file mode 100644 index 00000000..3ad88574 --- /dev/null +++ b/contrib/experimental_dict_builders/randomDictBuilder/main.c @@ -0,0 +1,161 @@ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* strcmp, strlen */ +#include /* errno */ +#include +#include "random.h" +#include "io.h" +#include "util.h" +#include "zdict.h" + + +/*-************************************* +* Console display +***************************************/ +#define DISPLAY(...) fprintf(stderr, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } + +static const U64 g_refreshRate = SEC_TO_MICRO / 6; +static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; + +#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \ + if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \ + { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ + if (displayLevel>=4) fflush(stderr); } } } + + +/*-************************************* +* Exceptions +***************************************/ +#ifndef DEBUG +# define DEBUG 0 +#endif +#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); +#define EXM_THROW(error, ...) \ +{ \ + DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ + DISPLAY("Error %i : ", error); \ + DISPLAY(__VA_ARGS__); \ + DISPLAY("\n"); \ + exit(error); \ +} + + +/*-************************************* +* Constants +***************************************/ +static const unsigned g_defaultMaxDictSize = 110 KB; +#define DEFAULT_CLEVEL 3 +#define DEFAULT_k 200 +#define DEFAULT_OUTPUTFILE "defaultDict" +#define DEFAULT_DICTID 0 + + + +/*-************************************* +* RANDOM +***************************************/ +int RANDOM_trainFromFiles(const char* dictFileName, sampleInfo *info, + unsigned maxDictSize, + ZDICT_random_params_t *params) { + unsigned const displayLevel = params->zParams.notificationLevel; + void* const dictBuffer = malloc(maxDictSize); + + int result = 0; + + /* Checks */ + if (!dictBuffer) + EXM_THROW(12, "not enough memory for trainFromFiles"); /* should not happen */ + + { size_t dictSize; + dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, *params); + DISPLAYLEVEL(2, "k=%u\n", params->k); + if (ZDICT_isError(dictSize)) { + DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ + result = 1; + goto _done; + } + /* save dict */ + DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); + saveDict(dictFileName, dictBuffer, dictSize); + } + + /* clean up */ +_done: + free(dictBuffer); + return result; +} + + + +int main(int argCount, const char* argv[]) +{ + int displayLevel = 2; + const char* programName = argv[0]; + int operationResult = 0; + + /* Initialize arguments to default values */ + unsigned k = DEFAULT_k; + const char* outputFile = DEFAULT_OUTPUTFILE; + unsigned dictID = DEFAULT_DICTID; + unsigned maxDictSize = g_defaultMaxDictSize; + + /* Initialize table to store input files */ + const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); + unsigned filenameIdx = 0; + + /* Parse arguments */ + for (int i = 1; i < argCount; i++) { + const char* argument = argv[i]; + if (longCommandWArg(&argument, "k=")) { k = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "dictID=")) { dictID = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "maxdict=")) { maxDictSize = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "in=")) { + filenameTable[filenameIdx] = argument; + filenameIdx++; + continue; + } + if (longCommandWArg(&argument, "out=")) { + outputFile = argument; + continue; + } + DISPLAYLEVEL(1, "Incorrect parameters\n"); + operationResult = 1; + return operationResult; + } + + char* fileNamesBuf = NULL; + unsigned fileNamesNb = filenameIdx; + int followLinks = 0; /* follow directory recursively */ + const char** extendedFileList = NULL; + extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, + &fileNamesNb, followLinks); + if (extendedFileList) { + unsigned u; + for (u=0; u /* fprintf */ +#include /* malloc, free, qsort */ +#include /* memset */ +#include /* clock */ +#include "random.h" +#include "util.h" /* UTIL_getFileSize, UTIL_getTotalFileSize */ +#ifndef ZDICT_STATIC_LINKING_ONLY +#define ZDICT_STATIC_LINKING_ONLY +#endif +#include "zdict.h" + +/*-************************************* +* Console display +***************************************/ +#define DISPLAY(...) fprintf(stderr, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } + +#define LOCALDISPLAYUPDATE(displayLevel, l, ...) \ + if (displayLevel >= l) { \ + if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) { \ + g_time = clock(); \ + DISPLAY(__VA_ARGS__); \ + } \ + } +#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(displayLevel, l, __VA_ARGS__) +static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100; +static clock_t g_time = 0; + + + +/* ******************************************************** +* Random Dictionary Builder +**********************************************************/ +/** + * Returns the sum of the sample sizes. + */ +static size_t RANDOM_sum(const size_t *samplesSizes, unsigned nbSamples) { + size_t sum = 0; + unsigned i; + for (i = 0; i < nbSamples; ++i) { + sum += samplesSizes[i]; + } + return sum; +} + + +/** + * A segment is an inclusive range in the source. + */ +typedef struct { + U32 begin; + U32 end; +} RANDOM_segment_t; + + +/** + * Selects a random segment from totalSamplesSize - k + 1 possible segments + */ +static RANDOM_segment_t RANDOM_selectSegment(const size_t totalSamplesSize, + ZDICT_random_params_t parameters) { + const U32 k = parameters.k; + RANDOM_segment_t segment; + unsigned index; + + /* Randomly generate a number from 0 to sampleSizes - k */ + index = rand()%(totalSamplesSize - k + 1); + + /* inclusive */ + segment.begin = index; + segment.end = index + k - 1; + + return segment; +} + + +/** + * Check the validity of the parameters. + * Returns non-zero if the parameters are valid and 0 otherwise. + */ +static int RANDOM_checkParameters(ZDICT_random_params_t parameters, + size_t maxDictSize) { + /* k is a required parameter */ + if (parameters.k == 0) { + return 0; + } + /* k <= maxDictSize */ + if (parameters.k > maxDictSize) { + return 0; + } + return 1; +} + + +/** + * Given the prepared context build the dictionary. + */ +static size_t RANDOM_buildDictionary(const size_t totalSamplesSize, const BYTE *samples, + void *dictBuffer, size_t dictBufferCapacity, + ZDICT_random_params_t parameters) { + BYTE *const dict = (BYTE *)dictBuffer; + size_t tail = dictBufferCapacity; + const int displayLevel = parameters.zParams.notificationLevel; + while (tail > 0) { + + /* Select a segment */ + RANDOM_segment_t segment = RANDOM_selectSegment(totalSamplesSize, parameters); + + size_t segmentSize; + segmentSize = MIN(segment.end - segment.begin + 1, tail); + + tail -= segmentSize; + memcpy(dict + tail, samples + segment.begin, segmentSize); + DISPLAYUPDATE( + 2, "\r%u%% ", + (U32)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity)); + } + + return tail; +} + + + + +ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( + void *dictBuffer, size_t dictBufferCapacity, + const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, + ZDICT_random_params_t parameters) { + const int displayLevel = parameters.zParams.notificationLevel; + BYTE* const dict = (BYTE*)dictBuffer; + /* Checks */ + if (!RANDOM_checkParameters(parameters, dictBufferCapacity)) { + DISPLAYLEVEL(1, "k is incorrect\n"); + return ERROR(GENERIC); + } + if (nbSamples == 0) { + DISPLAYLEVEL(1, "Random must have at least one input file\n"); + return ERROR(GENERIC); + } + if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) { + DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n", + ZDICT_DICTSIZE_MIN); + return ERROR(dstSize_tooSmall); + } + const size_t totalSamplesSize = RANDOM_sum(samplesSizes, nbSamples); + const BYTE *const samples = (const BYTE *)samplesBuffer; + + DISPLAYLEVEL(2, "Building dictionary\n"); + { + const size_t tail = RANDOM_buildDictionary(totalSamplesSize, samples, + dictBuffer, dictBufferCapacity, parameters); + const size_t dictSize = ZDICT_finalizeDictionary( + dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail, + samplesBuffer, samplesSizes, nbSamples, parameters.zParams); + if (!ZSTD_isError(dictSize)) { + DISPLAYLEVEL(2, "Constructed dictionary of size %u\n", + (U32)dictSize); + } + return dictSize; + } +} diff --git a/contrib/experimental_dict_builders/randomDictBuilder/random.h b/contrib/experimental_dict_builders/randomDictBuilder/random.h new file mode 100644 index 00000000..352775f9 --- /dev/null +++ b/contrib/experimental_dict_builders/randomDictBuilder/random.h @@ -0,0 +1,29 @@ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* memset */ +#include /* clock */ +#include "zstd_internal.h" /* includes zstd.h */ +#ifndef ZDICT_STATIC_LINKING_ONLY +#define ZDICT_STATIC_LINKING_ONLY +#endif +#include "zdict.h" + + + +typedef struct { + unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+]; Default to 200 */ + ZDICT_params_t zParams; +} ZDICT_random_params_t; + + +/*! ZDICT_trainFromBuffer_random(): + * Train a dictionary from an array of samples. + * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, + * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. + * The resulting dictionary will be saved into `dictBuffer`. + * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) + * or an error code, which can be tested with ZDICT_isError(). + */ +ZDICTLIB_API size_t ZDICT_trainFromBuffer_random( void *dictBuffer, size_t dictBufferCapacity, + const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, + ZDICT_random_params_t parameters); diff --git a/contrib/experimental_dict_builders/randomDictBuilder/test.sh b/contrib/experimental_dict_builders/randomDictBuilder/test.sh new file mode 100644 index 00000000..1eb732e5 --- /dev/null +++ b/contrib/experimental_dict_builders/randomDictBuilder/test.sh @@ -0,0 +1,14 @@ +echo "Building random dictionary with in=../../lib/common k=200 out=dict1" +./main in=../../../lib/common k=200 out=dict1 +zstd -be3 -D dict1 -r ../../../lib/common -q +echo "Building random dictionary with in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000" +./main in=../../../lib/common k=500 out=dict2 dictID=100 maxdict=140000 +zstd -be3 -D dict2 -r ../../../lib/common -q +echo "Building random dictionary with 2 sample sources" +./main in=../../../lib/common in=../../../lib/compress out=dict3 +zstd -be3 -D dict3 -r ../../../lib/common -q +echo "Removing dict1 dict2 dict3" +rm -f dict1 dict2 dict3 + +echo "Testing with invalid parameters, should fail" +! ./main r=10 diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index c6686252..d659baf1 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -3332,9 +3332,11 @@ size_t ZSTD_CStreamOutSize(void) static size_t ZSTD_resetCStream_internal(ZSTD_CStream* cctx, const void* const dict, size_t const dictSize, ZSTD_dictContentType_e const dictContentType, const ZSTD_CDict* const cdict, - ZSTD_CCtx_params const params, unsigned long long const pledgedSrcSize) + ZSTD_CCtx_params params, unsigned long long const pledgedSrcSize) { DEBUGLOG(4, "ZSTD_resetCStream_internal"); + /* Finalize the compression parameters */ + params.cParams = ZSTD_getCParamsFromCCtxParams(¶ms, pledgedSrcSize, dictSize); /* params are supposed to be fully validated at this point */ assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); assert(!((dict) && (cdict))); /* either dict or cdict, not both */ @@ -3363,7 +3365,6 @@ size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize) DEBUGLOG(4, "ZSTD_resetCStream: pledgedSrcSize = %u", (U32)pledgedSrcSize); if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN; params.fParams.contentSizeFlag = 1; - params.cParams = ZSTD_getCParamsFromCCtxParams(¶ms, pledgedSrcSize, 0); return ZSTD_resetCStream_internal(zcs, NULL, 0, ZSTD_dct_auto, zcs->cdict, params, pledgedSrcSize); } @@ -3376,6 +3377,7 @@ size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs, ZSTD_CCtx_params params, unsigned long long pledgedSrcSize) { DEBUGLOG(4, "ZSTD_initCStream_internal"); + params.cParams = ZSTD_getCParamsFromCCtxParams(¶ms, pledgedSrcSize, dictSize); assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); assert(!((dict) && (cdict))); /* either dict or cdict, not both */ @@ -3442,25 +3444,21 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, (U32)pledgedSrcSize, params.fParams.contentSizeFlag); CHECK_F( ZSTD_checkCParams(params.cParams) ); if ((pledgedSrcSize==0) && (params.fParams.contentSizeFlag==0)) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN; /* for compatibility with older programs relying on this behavior. Users should now specify ZSTD_CONTENTSIZE_UNKNOWN. This line will be removed in the future. */ - { ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params); - return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL /*cdict*/, cctxParams, pledgedSrcSize); - } + zcs->requestedParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params); + return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL /*cdict*/, zcs->requestedParams, pledgedSrcSize); } size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel) { - ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, dictSize); - ZSTD_CCtx_params const cctxParams = - ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params); - return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL, cctxParams, ZSTD_CONTENTSIZE_UNKNOWN); + ZSTD_CCtxParams_init(&zcs->requestedParams, compressionLevel); + return ZSTD_initCStream_internal(zcs, dict, dictSize, NULL, zcs->requestedParams, ZSTD_CONTENTSIZE_UNKNOWN); } size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pss) { U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss; /* temporary : 0 interpreted as "unknown" during transition period. Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN. `0` will be interpreted as "empty" in the future */ - ZSTD_parameters const params = ZSTD_getParams(compressionLevel, pledgedSrcSize, 0); - ZSTD_CCtx_params const cctxParams = ZSTD_assignParamsToCCtxParams(zcs->requestedParams, params); - return ZSTD_initCStream_internal(zcs, NULL, 0, NULL, cctxParams, pledgedSrcSize); + ZSTD_CCtxParams_init(&zcs->requestedParams, compressionLevel); + return ZSTD_initCStream_internal(zcs, NULL, 0, NULL, zcs->requestedParams, pledgedSrcSize); } size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) diff --git a/lib/compress/zstdmt_compress.c b/lib/compress/zstdmt_compress.c index 6daedca8..d5193d52 100644 --- a/lib/compress/zstdmt_compress.c +++ b/lib/compress/zstdmt_compress.c @@ -37,7 +37,9 @@ #define ZSTD_RESIZE_SEQPOOL 0 /* ====== Debug ====== */ -#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=2) && !defined(_MSC_VER) +#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=2) \ + && !defined(_MSC_VER) \ + && !defined(__MINGW32__) # include # include diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c index 2024e0bb..b8a51789 100644 --- a/lib/dictBuilder/zdict.c +++ b/lib/dictBuilder/zdict.c @@ -293,7 +293,7 @@ static dictItem ZDICT_analyzePos( refinedEnd = refinedStart + selectedCount; } - /* evaluate gain based on new ref */ + /* evaluate gain based on new dict */ start = refinedStart; pos = suffix[refinedStart]; end = start; @@ -341,7 +341,7 @@ static dictItem ZDICT_analyzePos( for (i=MINMATCHLENGTH; i<=(int)maxLength; i++) savings[i] = savings[i-1] + (lengthList[i] * (i-3)); - DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n", + DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n", (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength); solution.pos = (U32)pos; @@ -581,7 +581,7 @@ static void ZDICT_fillNoise(void* buffer, size_t length) typedef struct { - ZSTD_CCtx* ref; /* contains reference to dictionary */ + ZSTD_CDict* dict; /* dictionary */ ZSTD_CCtx* zc; /* working context */ void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */ } EStats_ress_t; @@ -597,8 +597,9 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params, size_t cSize; if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */ - { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0); - if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; } + { size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict); + if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; } + } cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize); if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; } @@ -708,14 +709,6 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, /* init */ DEBUGLOG(4, "ZDICT_analyzeEntropy"); - esr.ref = ZSTD_createCCtx(); - esr.zc = ZSTD_createCCtx(); - esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX); - if (!esr.ref || !esr.zc || !esr.workPlace) { - eSize = ERROR(memory_allocation); - DISPLAYLEVEL(1, "Not enough memory \n"); - goto _cleanup; - } if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */ for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */ for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1; @@ -726,12 +719,15 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, memset(bestRepOffset, 0, sizeof(bestRepOffset)); if (compressionLevel==0) compressionLevel = g_compressionLevel_default; params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize); - { size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0); - if (ZSTD_isError(beginResult)) { - DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult)); - eSize = ERROR(GENERIC); - goto _cleanup; - } } + + esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem); + esr.zc = ZSTD_createCCtx(); + esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX); + if (!esr.dict || !esr.zc || !esr.workPlace) { + eSize = ERROR(memory_allocation); + DISPLAYLEVEL(1, "Not enough memory \n"); + goto _cleanup; + } /* collect stats on all samples */ for (u=0; u%10u (%5.*f),%6.*f MB/s\r", diff --git a/programs/fileio.c b/programs/fileio.c index b4eed28d..85367fdf 100644 --- a/programs/fileio.c +++ b/programs/fileio.c @@ -797,6 +797,14 @@ FIO_compressZstdFrame(const cRess_t* ressPtr, } } while (directive != ZSTD_e_end); + if (ferror(srcFile)) { + EXM_THROW(26, "Read error : I/O error"); + } + if (fileSize != UTIL_FILESIZE_UNKNOWN && *readsize != fileSize) { + EXM_THROW(27, "Read error : Incomplete read : %llu / %llu B", + (unsigned long long)*readsize, (unsigned long long)fileSize); + } + return compressedfilesize; } diff --git a/programs/zstdcli.c b/programs/zstdcli.c index e0d7807f..36ba2115 100644 --- a/programs/zstdcli.c +++ b/programs/zstdcli.c @@ -502,7 +502,7 @@ int main(int argCount, const char* argv[]) if (!strcmp(argument, "--sparse")) { FIO_setSparseWrite(2); continue; } if (!strcmp(argument, "--no-sparse")) { FIO_setSparseWrite(0); continue; } if (!strcmp(argument, "--test")) { operation=zom_test; continue; } - if (!strcmp(argument, "--train")) { operation=zom_train; outFileName=g_defaultDictName; continue; } + if (!strcmp(argument, "--train")) { operation=zom_train; if (outFileName==NULL) outFileName=g_defaultDictName; continue; } if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; lastCommand=1; continue; } /* kept available for compatibility with old syntax ; will be removed one day */ if (!strcmp(argument, "--dictID")) { nextArgumentIsDictID=1; lastCommand=1; continue; } /* kept available for compatibility with old syntax ; will be removed one day */ if (!strcmp(argument, "--no-dictID")) { FIO_setDictIDFlag(0); continue; } @@ -526,7 +526,8 @@ int main(int argCount, const char* argv[]) #ifndef ZSTD_NODICT if (longCommandWArg(&argument, "--train-cover")) { operation = zom_train; - outFileName = g_defaultDictName; + if (outFileName == NULL) + outFileName = g_defaultDictName; cover = 1; /* Allow optional arguments following an = */ if (*argument == 0) { memset(&coverParams, 0, sizeof(coverParams)); } @@ -536,7 +537,8 @@ int main(int argCount, const char* argv[]) } if (longCommandWArg(&argument, "--train-legacy")) { operation = zom_train; - outFileName = g_defaultDictName; + if (outFileName == NULL) + outFileName = g_defaultDictName; cover = 0; /* Allow optional arguments following an = */ if (*argument == 0) { continue; } @@ -718,7 +720,7 @@ int main(int argCount, const char* argv[]) break; /* Select compressibility of synthetic sample */ - case 'P': + case 'P': { argument++; compressibility = (double)readU32FromChar(&argument) / 100; } @@ -841,7 +843,7 @@ int main(int argCount, const char* argv[]) if (cLevel > ZSTD_maxCLevel()) cLevel = ZSTD_maxCLevel(); if (cLevelLast > ZSTD_maxCLevel()) cLevelLast = ZSTD_maxCLevel(); if (cLevelLast < cLevel) cLevelLast = cLevel; - if (cLevelLast > cLevel) + if (cLevelLast > cLevel) DISPLAYLEVEL(2, "Benchmarking levels from %d to %d\n", cLevel, cLevelLast); if(filenameIdx) { if(separateFiles) { @@ -856,7 +858,7 @@ int main(int argCount, const char* argv[]) } else { for(; cLevel <= cLevelLast; cLevel++) { BMK_benchFilesAdvanced(filenameTable, filenameIdx, dictFileName, cLevel, &compressionParams, g_displayLevel, &adv); - } + } } } else { for(; cLevel <= cLevelLast; cLevel++) { diff --git a/tests/playTests.sh b/tests/playTests.sh index fb8b1d24..0a1f96c0 100755 --- a/tests/playTests.sh +++ b/tests/playTests.sh @@ -404,7 +404,13 @@ $ECHO "Hello World" > tmp $ZSTD --train-legacy -q tmp && die "Dictionary training should fail : not enough input source" ./datagen -P0 -g10M > tmp $ZSTD --train-legacy -q tmp && die "Dictionary training should fail : source is pure noise" -rm tmp* +$ECHO "- Test -o before --train" +rm -f tmpDict dictionary +$ZSTD -o tmpDict --train *.c ../programs/*.c +test -f tmpDict +$ZSTD --train *.c ../programs/*.c +test -f dictionary +rm tmp* dictionary $ECHO "\n===> cover dictionary builder : advanced options " @@ -425,12 +431,18 @@ $ZSTD --train-cover=k=46,d=8 *.c ../programs/*.c --dictID=1 -o tmpDict1 cmp tmpDict tmpDict1 && die "dictionaries should have different ID !" $ECHO "- Create dictionary with size limit" $ZSTD --train-cover=steps=8 *.c ../programs/*.c -o tmpDict2 --maxdict=4K -rm tmp* $ECHO "- Compare size of dictionary from 90% training samples with 80% training samples" $ZSTD --train-cover=split=90 -r *.c ../programs/*.c $ZSTD --train-cover=split=80 -r *.c ../programs/*.c $ECHO "- Create dictionary using all samples for both training and testing" $ZSTD --train-cover=split=100 -r *.c ../programs/*.c +$ECHO "- Test -o before --train-cover" +rm -f tmpDict dictionary +$ZSTD -o tmpDict --train-cover *.c ../programs/*.c +test -f tmpDict +$ZSTD --train-cover *.c ../programs/*.c +test -f dictionary +rm tmp* dictionary $ECHO "\n===> legacy dictionary builder " @@ -450,7 +462,13 @@ $ZSTD --train-legacy -s5 *.c ../programs/*.c --dictID=1 -o tmpDict1 cmp tmpDict tmpDict1 && die "dictionaries should have different ID !" $ECHO "- Create dictionary with size limit" $ZSTD --train-legacy -s9 *.c ../programs/*.c -o tmpDict2 --maxdict=4K -rm tmp* +$ECHO "- Test -o before --train-legacy" +rm -f tmpDict dictionary +$ZSTD -o tmpDict --train-legacy *.c ../programs/*.c +test -f tmpDict +$ZSTD --train-legacy *.c ../programs/*.c +test -f dictionary +rm tmp* dictionary $ECHO "\n===> integrity tests " diff --git a/tests/zstreamtest.c b/tests/zstreamtest.c index 22c49cb3..3d61394a 100644 --- a/tests/zstreamtest.c +++ b/tests/zstreamtest.c @@ -969,6 +969,26 @@ static int basicUnitTests(U32 seed, double compressibility) } DISPLAYLEVEL(3, "OK \n"); + DISPLAYLEVEL(3, "test%3i : ZSTD_initCStream_srcSize sets requestedParams : ", testNb++); + { unsigned level; + CHECK_Z(ZSTD_initCStream_srcSize(zc, 11, ZSTD_CONTENTSIZE_UNKNOWN)); + CHECK_Z(ZSTD_CCtx_getParameter(zc, ZSTD_p_compressionLevel, &level)); + CHECK(level != 11, "Compression level does not match"); + ZSTD_resetCStream(zc, ZSTD_CONTENTSIZE_UNKNOWN); + CHECK_Z(ZSTD_CCtx_getParameter(zc, ZSTD_p_compressionLevel, &level)); + CHECK(level != 11, "Compression level does not match"); + } + DISPLAYLEVEL(3, "OK \n"); + + DISPLAYLEVEL(3, "test%3i : ZSTD_initCStream_advanced sets requestedParams : ", testNb++); + { ZSTD_parameters const params = ZSTD_getParams(9, 0, 0); + CHECK_Z(ZSTD_initCStream_advanced(zc, NULL, 0, params, ZSTD_CONTENTSIZE_UNKNOWN)); + CHECK(badParameters(zc, params), "Compression parameters do not match"); + ZSTD_resetCStream(zc, ZSTD_CONTENTSIZE_UNKNOWN); + CHECK(badParameters(zc, params), "Compression parameters do not match"); + } + DISPLAYLEVEL(3, "OK \n"); + /* Overlen overwriting window data bug */ DISPLAYLEVEL(3, "test%3i : wildcopy doesn't overwrite potential match data : ", testNb++); { /* This test has a window size of 1024 bytes and consists of 3 blocks: