diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c index 1d373225..be141ce1 100644 --- a/lib/dictBuilder/zdict.c +++ b/lib/dictBuilder/zdict.c @@ -819,10 +819,10 @@ size_t ZDICT_trainFromBuffer_unsafe( ZDICT_params_t params) { U32 const dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16)); - dictItem* dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList)); + dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList)); unsigned selectivity = params.selectivityLevel; unsigned compressionLevel = params.compressionLevel; - size_t targetDictSize = maxDictSize; + size_t const targetDictSize = maxDictSize; size_t sBuffSize; size_t dictSize = 0; @@ -865,17 +865,16 @@ size_t ZDICT_trainFromBuffer_unsafe( /* create dictionary */ { U32 dictContentSize = ZDICT_dictSize(dictList); size_t hSize; - BYTE* ptr; - U32 u; /* build dict content */ - ptr = (BYTE*)dictBuffer + maxDictSize; - for (u=1; upos; u++) { - U32 l = dictList[u].length; - ptr -= l; - if (ptr<(BYTE*)dictBuffer) return ERROR(GENERIC); /* should not happen */ - memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l); - } + { U32 u; + BYTE* ptr = (BYTE*)dictBuffer + maxDictSize; + for (u=1; upos; u++) { + U32 l = dictList[u].length; + ptr -= l; + if (ptr<(BYTE*)dictBuffer) return ERROR(GENERIC); /* should not happen */ + memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l); + } } /* fast mode dict content */ if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */ @@ -888,7 +887,8 @@ size_t ZDICT_trainFromBuffer_unsafe( /* dictionary header */ MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC); { U64 const randomID = XXH64((char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize, 0); - MEM_writeLE32((char*)dictBuffer+4, (U32)(randomID>>11)); + U32 const dictID = params.dictID ? 
params.dictID : (U32)(randomID>>11); + MEM_writeLE32((char*)dictBuffer+4, dictID); } hSize = 8; diff --git a/lib/dictBuilder/zdict_static.h b/lib/dictBuilder/zdict_static.h index e5f909ac..e34e6c07 100644 --- a/lib/dictBuilder/zdict_static.h +++ b/lib/dictBuilder/zdict_static.h @@ -54,7 +54,8 @@ typedef struct { unsigned selectivityLevel; /* 0 means default; larger => bigger selection => larger dictionary */ unsigned compressionLevel; /* 0 means default; target a specific zstd compression level */ unsigned notificationLevel; /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ - unsigned reserved[3]; /* space for future parameters */ + unsigned dictID; /* 0 means auto mode (32-bit random value); any other value forces that dictID */ + unsigned reserved[2]; /* space for future parameters */ } ZDICT_params_t; @@ -65,7 +66,7 @@ typedef struct { Same as ZDICT_trainFromBuffer() with control over more parameters. `parameters` is optional and can be provided with values set to 0 to mean "default". @return : size of dictionary stored into `dictBuffer` (<= `dictBufferSize`) - or an error code, which can be tested by DiB_isError(). + or an error code, which can be tested by ZDICT_isError(). 
note : ZDICT_trainFromBuffer_advanced() will send notifications into stderr if instructed to, using ZDICT_setNotificationLevel() */ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity, diff --git a/programs/dibio.c b/programs/dibio.c index 23f3c817..d23476e3 100644 --- a/programs/dibio.c +++ b/programs/dibio.c @@ -101,27 +101,30 @@ const char* DiB_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCo /* ******************************************************** * File related operations **********************************************************/ -static void DiB_loadFiles(void* buffer, size_t bufferSize, - size_t* fileSizes, - const char** fileNamesTable, unsigned nbFiles) +/** DiB_loadFiles() : +* @return : nb of files effectively loaded into `buffer` */ +static unsigned DiB_loadFiles(void* buffer, size_t bufferSize, + size_t* fileSizes, + const char** fileNamesTable, unsigned nbFiles) { - char* buff = (char*)buffer; + char* const buff = (char*)buffer; size_t pos = 0; unsigned n; for (n=0; n bufferSize-pos ? 0 : fs64); + FILE* const f = fopen(fileNamesTable[n], "rb"); if (f==NULL) EXM_THROW(10, "impossible to open file %s", fileNamesTable[n]); DISPLAYUPDATE(2, "Loading %s... 
\r", fileNamesTable[n]); - if (fileSize > bufferSize-pos) fileSize = 0; /* stop there, not enough memory to load all files */ - readSize = fread(buff+pos, 1, (size_t)fileSize, f); - if (readSize != (size_t)fileSize) EXM_THROW(11, "could not read %s", fileNamesTable[n]); - pos += readSize; - fileSizes[n] = (size_t)fileSize; + { size_t const readSize = fread(buff+pos, 1, fileSize, f); + if (readSize != fileSize) EXM_THROW(11, "could not read %s", fileNamesTable[n]); + pos += readSize; } + fileSizes[n] = fileSize; fclose(f); + if (fileSize == 0) break; /* stop there, not enough memory to load all files */ } + return n; } @@ -130,7 +133,7 @@ static void DiB_loadFiles(void* buffer, size_t bufferSize, **********************************************************/ static size_t DiB_findMaxMem(unsigned long long requiredMem) { - size_t step = 8 MB; + size_t const step = 8 MB; void* testmem = NULL; requiredMem = (((requiredMem >> 23) + 1) << 23); @@ -162,7 +165,7 @@ static void DiB_fillNoise(void* buffer, size_t length) static void DiB_saveDict(const char* dictFileName, const void* buff, size_t buffSize) { - FILE* f = fopen(dictFileName, "wb"); + FILE* const f = fopen(dictFileName, "wb"); if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName); { size_t const n = fwrite(buff, 1, buffSize, f); @@ -185,47 +188,44 @@ size_t ZDICT_trainFromBuffer_unsafe(void* dictBuffer, size_t dictBufferCapacity, ZDICT_params_t parameters); +#define MIN(a,b) ((a)<(b)?(a):(b)) int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize, const char** fileNamesTable, unsigned nbFiles, ZDICT_params_t params) { - void* srcBuffer; - size_t benchedSize; - size_t* fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t)); - unsigned long long totalSizeToLoad = UTIL_getTotalFileSize(fileNamesTable, nbFiles); - void* dictBuffer = malloc(maxDictSize); - size_t dictSize; + void* const dictBuffer = malloc(maxDictSize); + size_t* const fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t)); + unsigned 
long long const totalSizeToLoad = UTIL_getTotalFileSize(fileNamesTable, nbFiles); + size_t const maxMem = DiB_findMaxMem(totalSizeToLoad * MEMMULT) / MEMMULT; + size_t const benchedSize = MIN (maxMem, (size_t)totalSizeToLoad); + void* const srcBuffer = malloc(benchedSize+NOISELENGTH); int result = 0; + /* Checks */ + if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */ + /* init */ g_displayLevel = params.notificationLevel; - benchedSize = DiB_findMaxMem(totalSizeToLoad * MEMMULT) / MEMMULT; - if ((unsigned long long)benchedSize > totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad; if (benchedSize < totalSizeToLoad) DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20)); - /* Memory allocation & restrictions */ - srcBuffer = malloc(benchedSize+NOISELENGTH); /* + noise */ - if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */ - /* Load input buffer */ - DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles); + nbFiles = DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles); DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */ - /* call buffer version */ - dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize, - srcBuffer, fileSizes, nbFiles, - params); - if (ZDICT_isError(dictSize)) { - DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ - result = 1; - goto _cleanup; + { size_t const dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize, + srcBuffer, fileSizes, nbFiles, + params); + if (ZDICT_isError(dictSize)) { + DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ + result = 1; + goto _cleanup; + } + /* save dict */ + DISPLAYLEVEL(2, "Save 
dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); + DiB_saveDict(dictFileName, dictBuffer, dictSize); } - /* save dict */ - DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); - DiB_saveDict(dictFileName, dictBuffer, dictSize); - /* clean up */ _cleanup: free(srcBuffer); diff --git a/programs/tests/playTests.sh b/programs/tests/playTests.sh index 17290523..ae782303 100755 --- a/programs/tests/playTests.sh +++ b/programs/tests/playTests.sh @@ -129,6 +129,8 @@ $ZSTD -d tmp -D tmpDict -of result diff zstdcli.c result $ZSTD --train *.c *.h -o tmpDictC $ZSTD -d tmp -D tmpDictC -of result && die "wrong dictionary not detected!" +$ZSTD --train *.c --dictID 1 -o tmpDict1 +cmp tmpDict tmpDict1 && die "dictionaries should have different ID !" $ECHO "\n**** multiple files tests **** " diff --git a/programs/zstd.1 b/programs/zstd.1 index 27d607f5..1bab57ab 100644 --- a/programs/zstd.1 +++ b/programs/zstd.1 @@ -18,11 +18,11 @@ .PP .B unzstd is equivalent to -.BR "zstd \-d" +.BR "zstd \-d" .br .B zstdcat is equivalent to -.BR "zstd \-dc" +.BR "zstd \-dc" .br .SH DESCRIPTION @@ -90,7 +90,15 @@ Typical gains range from ~10% (at 64KB) to x5 better (at <1KB). dictionary saved into `file` (default: dictionary) .TP .B \--maxdict # - limit dictionary to specified size (default : 112640) + limit dictionary to specified size (default : 112640) +.TP +.B \--dictID # + A dictionary ID is a locally unique ID that a decoder can use to verify it is using the right dictionary. + By default, zstd will create a random 4-byte ID. + It's possible to specify an exact ID instead. + Short numbers have an advantage: an ID < 256 will only need 1 byte in the compressed frame header, + and an ID < 65536 will only need 2 bytes. This compares favorably to the 4-byte default. + However, it's up to the dictionary manager not to assign the same ID to two different dictionaries. 
.TP .B \-s# dictionary selectivity level (default: 9) diff --git a/programs/zstdcli.c b/programs/zstdcli.c index 68dd98cb..74f3878b 100644 --- a/programs/zstdcli.c +++ b/programs/zstdcli.c @@ -143,6 +143,7 @@ static int usage_advanced(const char* programName) DISPLAY( " -o file: `file` is dictionary name (default: %s) \n", g_defaultDictName); DISPLAY( "--maxdict:limit dictionary to specified size (default : %u) \n", g_defaultMaxDictSize); DISPLAY( " -s# : dictionary selectivity level (default: %u)\n", g_defaultSelectivityLevel); + DISPLAY( "--dictID: force dictionary ID to specified value (default: random)\n"); #endif #ifndef ZSTD_NOBENCH DISPLAY( "\n"); @@ -185,7 +186,8 @@ int main(int argCount, const char** argv) operationResult=0, dictBuild=0, nextArgumentIsOutFileName=0, - nextArgumentIsMaxDict=0; + nextArgumentIsMaxDict=0, + nextArgumentIsDictID=0; unsigned cLevel = 1; unsigned cLevelLast = 1; unsigned recursive = 0; @@ -196,6 +198,7 @@ int main(int argCount, const char** argv) const char* dictFileName = NULL; char* dynNameSpace = NULL; unsigned maxDictSize = g_defaultMaxDictSize; + unsigned dictID = 0; unsigned dictCLevel = g_defaultDictCLevel; unsigned dictSelect = g_defaultSelectivityLevel; #ifdef UTIL_HAS_CREATEFILELIST @@ -236,6 +239,7 @@ int main(int argCount, const char** argv) if (!strcmp(argument, "--test")) { decode=1; outFileName=nulmark; FIO_overwriteMode(); continue; } if (!strcmp(argument, "--train")) { dictBuild=1; outFileName=g_defaultDictName; continue; } if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; continue; } + if (!strcmp(argument, "--dictID")) { nextArgumentIsDictID=1; continue; } if (!strcmp(argument, "--keep")) { continue; } /* does nothing, since preserving input is default; for gzip/xz compatibility */ if (!strcmp(argument, "--ultra")) { FIO_setMaxWLog(0); continue; } if (!strcmp(argument, "--sparse")) { FIO_setSparseWrite(2); continue; } @@ -393,6 +397,14 @@ int main(int argCount, const char** argv) continue; } + if 
(nextArgumentIsDictID) { + nextArgumentIsDictID = 0; + dictID = 0; + while ((*argument>='0') && (*argument<='9')) + dictID = dictID * 10 + (*argument - '0'), argument++; + continue; + } + /* add filename to list */ filenameTable[filenameIdx++] = argument; } @@ -429,6 +441,7 @@ int main(int argCount, const char** argv) dictParams.compressionLevel = dictCLevel; dictParams.selectivityLevel = dictSelect; dictParams.notificationLevel = displayLevel; + dictParams.dictID = dictID; DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, dictParams); #endif goto _end;