Added : ability to manually select the dictionary ID of a newly created dictionary

This commit is contained in:
Yann Collet 2016-05-30 21:18:52 +02:00
parent 815580a538
commit 290aaa7521
6 changed files with 81 additions and 57 deletions

View File

@ -819,10 +819,10 @@ size_t ZDICT_trainFromBuffer_unsafe(
ZDICT_params_t params) ZDICT_params_t params)
{ {
U32 const dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16)); U32 const dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16));
dictItem* dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList)); dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
unsigned selectivity = params.selectivityLevel; unsigned selectivity = params.selectivityLevel;
unsigned compressionLevel = params.compressionLevel; unsigned compressionLevel = params.compressionLevel;
size_t targetDictSize = maxDictSize; size_t const targetDictSize = maxDictSize;
size_t sBuffSize; size_t sBuffSize;
size_t dictSize = 0; size_t dictSize = 0;
@ -865,17 +865,16 @@ size_t ZDICT_trainFromBuffer_unsafe(
/* create dictionary */ /* create dictionary */
{ U32 dictContentSize = ZDICT_dictSize(dictList); { U32 dictContentSize = ZDICT_dictSize(dictList);
size_t hSize; size_t hSize;
BYTE* ptr;
U32 u;
/* build dict content */ /* build dict content */
ptr = (BYTE*)dictBuffer + maxDictSize; { U32 u;
BYTE* ptr = (BYTE*)dictBuffer + maxDictSize;
for (u=1; u<dictList->pos; u++) { for (u=1; u<dictList->pos; u++) {
U32 l = dictList[u].length; U32 l = dictList[u].length;
ptr -= l; ptr -= l;
if (ptr<(BYTE*)dictBuffer) return ERROR(GENERIC); /* should not happen */ if (ptr<(BYTE*)dictBuffer) return ERROR(GENERIC); /* should not happen */
memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l); memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
} } }
/* fast mode dict content */ /* fast mode dict content */
if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */ if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */
@ -888,7 +887,8 @@ size_t ZDICT_trainFromBuffer_unsafe(
/* dictionary header */ /* dictionary header */
MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC); MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
{ U64 const randomID = XXH64((char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize, 0); { U64 const randomID = XXH64((char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize, 0);
MEM_writeLE32((char*)dictBuffer+4, (U32)(randomID>>11)); U32 const dictID = params.dictID ? params.dictID : (U32)(randomID>>11);
MEM_writeLE32((char*)dictBuffer+4, dictID);
} }
hSize = 8; hSize = 8;

View File

@ -54,7 +54,8 @@ typedef struct {
unsigned selectivityLevel; /* 0 means default; larger => bigger selection => larger dictionary */ unsigned selectivityLevel; /* 0 means default; larger => bigger selection => larger dictionary */
unsigned compressionLevel; /* 0 means default; target a specific zstd compression level */ unsigned compressionLevel; /* 0 means default; target a specific zstd compression level */
unsigned notificationLevel; /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ unsigned notificationLevel; /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
unsigned reserved[3]; /* space for future parameters */ unsigned dictID; /* 0 means auto mode (32-bits random value); other : force dictID value */
unsigned reserved[2]; /* space for future parameters */
} ZDICT_params_t; } ZDICT_params_t;
@ -65,7 +66,7 @@ typedef struct {
Same as ZDICT_trainFromBuffer() with control over more parameters. Same as ZDICT_trainFromBuffer() with control over more parameters.
`parameters` is optional and can be provided with values set to 0 to mean "default". `parameters` is optional and can be provided with values set to 0 to mean "default".
@return : size of dictionary stored into `dictBuffer` (<= `dictBufferSize`) @return : size of dictionary stored into `dictBuffer` (<= `dictBufferSize`)
or an error code, which can be tested by DiB_isError(). or an error code, which can be tested by ZDICT_isError().
note : ZDICT_trainFromBuffer_advanced() will send notifications into stderr if instructed to, using ZDICT_setNotificationLevel() note : ZDICT_trainFromBuffer_advanced() will send notifications into stderr if instructed to, using ZDICT_setNotificationLevel()
*/ */
size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity, size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,

View File

@ -101,27 +101,30 @@ const char* DiB_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCo
/* ******************************************************** /* ********************************************************
* File related operations * File related operations
**********************************************************/ **********************************************************/
static void DiB_loadFiles(void* buffer, size_t bufferSize, /** DiB_loadFiles() :
* @return : nb of files effectively loaded into `buffer` */
static unsigned DiB_loadFiles(void* buffer, size_t bufferSize,
size_t* fileSizes, size_t* fileSizes,
const char** fileNamesTable, unsigned nbFiles) const char** fileNamesTable, unsigned nbFiles)
{ {
char* buff = (char*)buffer; char* const buff = (char*)buffer;
size_t pos = 0; size_t pos = 0;
unsigned n; unsigned n;
for (n=0; n<nbFiles; n++) { for (n=0; n<nbFiles; n++) {
size_t readSize; unsigned long long const fs64 = UTIL_getFileSize(fileNamesTable[n]);
unsigned long long fileSize = UTIL_getFileSize(fileNamesTable[n]); size_t const fileSize = (size_t)(fs64 > bufferSize-pos ? 0 : fs64);
FILE* f = fopen(fileNamesTable[n], "rb"); FILE* const f = fopen(fileNamesTable[n], "rb");
if (f==NULL) EXM_THROW(10, "impossible to open file %s", fileNamesTable[n]); if (f==NULL) EXM_THROW(10, "impossible to open file %s", fileNamesTable[n]);
DISPLAYUPDATE(2, "Loading %s... \r", fileNamesTable[n]); DISPLAYUPDATE(2, "Loading %s... \r", fileNamesTable[n]);
if (fileSize > bufferSize-pos) fileSize = 0; /* stop there, not enough memory to load all files */ { size_t const readSize = fread(buff+pos, 1, fileSize, f);
readSize = fread(buff+pos, 1, (size_t)fileSize, f); if (readSize != fileSize) EXM_THROW(11, "could not read %s", fileNamesTable[n]);
if (readSize != (size_t)fileSize) EXM_THROW(11, "could not read %s", fileNamesTable[n]); pos += readSize; }
pos += readSize; fileSizes[n] = fileSize;
fileSizes[n] = (size_t)fileSize;
fclose(f); fclose(f);
if (fileSize == 0) break; /* stop there, not enough memory to load all files */
} }
return n;
} }
@ -130,7 +133,7 @@ static void DiB_loadFiles(void* buffer, size_t bufferSize,
**********************************************************/ **********************************************************/
static size_t DiB_findMaxMem(unsigned long long requiredMem) static size_t DiB_findMaxMem(unsigned long long requiredMem)
{ {
size_t step = 8 MB; size_t const step = 8 MB;
void* testmem = NULL; void* testmem = NULL;
requiredMem = (((requiredMem >> 23) + 1) << 23); requiredMem = (((requiredMem >> 23) + 1) << 23);
@ -162,7 +165,7 @@ static void DiB_fillNoise(void* buffer, size_t length)
static void DiB_saveDict(const char* dictFileName, static void DiB_saveDict(const char* dictFileName,
const void* buff, size_t buffSize) const void* buff, size_t buffSize)
{ {
FILE* f = fopen(dictFileName, "wb"); FILE* const f = fopen(dictFileName, "wb");
if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName); if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
{ size_t const n = fwrite(buff, 1, buffSize, f); { size_t const n = fwrite(buff, 1, buffSize, f);
@ -185,35 +188,32 @@ size_t ZDICT_trainFromBuffer_unsafe(void* dictBuffer, size_t dictBufferCapacity,
ZDICT_params_t parameters); ZDICT_params_t parameters);
#define MIN(a,b) ((a)<(b)?(a):(b))
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize, int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
const char** fileNamesTable, unsigned nbFiles, const char** fileNamesTable, unsigned nbFiles,
ZDICT_params_t params) ZDICT_params_t params)
{ {
void* srcBuffer; void* const dictBuffer = malloc(maxDictSize);
size_t benchedSize; size_t* const fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
size_t* fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t)); unsigned long long const totalSizeToLoad = UTIL_getTotalFileSize(fileNamesTable, nbFiles);
unsigned long long totalSizeToLoad = UTIL_getTotalFileSize(fileNamesTable, nbFiles); size_t const maxMem = DiB_findMaxMem(totalSizeToLoad * MEMMULT) / MEMMULT;
void* dictBuffer = malloc(maxDictSize); size_t const benchedSize = MIN (maxMem, (size_t)totalSizeToLoad);
size_t dictSize; void* const srcBuffer = malloc(benchedSize+NOISELENGTH);
int result = 0; int result = 0;
/* Checks */
if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
/* init */ /* init */
g_displayLevel = params.notificationLevel; g_displayLevel = params.notificationLevel;
benchedSize = DiB_findMaxMem(totalSizeToLoad * MEMMULT) / MEMMULT;
if ((unsigned long long)benchedSize > totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad;
if (benchedSize < totalSizeToLoad) if (benchedSize < totalSizeToLoad)
DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20)); DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20));
/* Memory allocation & restrictions */
srcBuffer = malloc(benchedSize+NOISELENGTH); /* + noise */
if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
/* Load input buffer */ /* Load input buffer */
DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles); nbFiles = DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles);
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */ DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
/* call buffer version */ { size_t const dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
srcBuffer, fileSizes, nbFiles, srcBuffer, fileSizes, nbFiles,
params); params);
if (ZDICT_isError(dictSize)) { if (ZDICT_isError(dictSize)) {
@ -221,10 +221,10 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
result = 1; result = 1;
goto _cleanup; goto _cleanup;
} }
/* save dict */ /* save dict */
DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
DiB_saveDict(dictFileName, dictBuffer, dictSize); DiB_saveDict(dictFileName, dictBuffer, dictSize);
}
/* clean up */ /* clean up */
_cleanup: _cleanup:

View File

@ -129,6 +129,8 @@ $ZSTD -d tmp -D tmpDict -of result
diff zstdcli.c result diff zstdcli.c result
$ZSTD --train *.c *.h -o tmpDictC $ZSTD --train *.c *.h -o tmpDictC
$ZSTD -d tmp -D tmpDictC -of result && die "wrong dictionary not detected!" $ZSTD -d tmp -D tmpDictC -of result && die "wrong dictionary not detected!"
$ZSTD --train *.c --dictID 1 -o tmpDict1
cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
$ECHO "\n**** multiple files tests **** " $ECHO "\n**** multiple files tests **** "

View File

@ -92,6 +92,14 @@ Typical gains range from ~10% (at 64KB) to x5 better (at <1KB).
.B \--maxdict # .B \--maxdict #
limit dictionary to specified size (default : 112640) limit dictionary to specified size (default : 112640)
.TP .TP
.B \--dictID #
A dictionary ID is a locally unique ID that a decoder can use to verify it is using the right dictionary.
By default, zstd will create a 4-byte random number ID.
It's possible to give a precise number instead.
Short numbers have an advantage: an ID < 256 will only need 1 byte in the compressed frame header,
and an ID < 65536 will only need 2 bytes. This compares favorably to the 4-byte default.
However, it's up to the dictionary manager not to assign the same ID to two different dictionaries.
.TP
.B \-s# .B \-s#
dictionary selectivity level (default: 9) dictionary selectivity level (default: 9)
the smaller the value, the denser the dictionary, improving its efficiency but reducing its possible maximum size. the smaller the value, the denser the dictionary, improving its efficiency but reducing its possible maximum size.

View File

@ -143,6 +143,7 @@ static int usage_advanced(const char* programName)
DISPLAY( " -o file: `file` is dictionary name (default: %s) \n", g_defaultDictName); DISPLAY( " -o file: `file` is dictionary name (default: %s) \n", g_defaultDictName);
DISPLAY( "--maxdict:limit dictionary to specified size (default : %u) \n", g_defaultMaxDictSize); DISPLAY( "--maxdict:limit dictionary to specified size (default : %u) \n", g_defaultMaxDictSize);
DISPLAY( " -s# : dictionary selectivity level (default: %u)\n", g_defaultSelectivityLevel); DISPLAY( " -s# : dictionary selectivity level (default: %u)\n", g_defaultSelectivityLevel);
DISPLAY( "--dictID: force dictionary ID to specified value (default: random)\n");
#endif #endif
#ifndef ZSTD_NOBENCH #ifndef ZSTD_NOBENCH
DISPLAY( "\n"); DISPLAY( "\n");
@ -185,7 +186,8 @@ int main(int argCount, const char** argv)
operationResult=0, operationResult=0,
dictBuild=0, dictBuild=0,
nextArgumentIsOutFileName=0, nextArgumentIsOutFileName=0,
nextArgumentIsMaxDict=0; nextArgumentIsMaxDict=0,
nextArgumentIsDictID=0;
unsigned cLevel = 1; unsigned cLevel = 1;
unsigned cLevelLast = 1; unsigned cLevelLast = 1;
unsigned recursive = 0; unsigned recursive = 0;
@ -196,6 +198,7 @@ int main(int argCount, const char** argv)
const char* dictFileName = NULL; const char* dictFileName = NULL;
char* dynNameSpace = NULL; char* dynNameSpace = NULL;
unsigned maxDictSize = g_defaultMaxDictSize; unsigned maxDictSize = g_defaultMaxDictSize;
unsigned dictID = 0;
unsigned dictCLevel = g_defaultDictCLevel; unsigned dictCLevel = g_defaultDictCLevel;
unsigned dictSelect = g_defaultSelectivityLevel; unsigned dictSelect = g_defaultSelectivityLevel;
#ifdef UTIL_HAS_CREATEFILELIST #ifdef UTIL_HAS_CREATEFILELIST
@ -236,6 +239,7 @@ int main(int argCount, const char** argv)
if (!strcmp(argument, "--test")) { decode=1; outFileName=nulmark; FIO_overwriteMode(); continue; } if (!strcmp(argument, "--test")) { decode=1; outFileName=nulmark; FIO_overwriteMode(); continue; }
if (!strcmp(argument, "--train")) { dictBuild=1; outFileName=g_defaultDictName; continue; } if (!strcmp(argument, "--train")) { dictBuild=1; outFileName=g_defaultDictName; continue; }
if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; continue; } if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; continue; }
if (!strcmp(argument, "--dictID")) { nextArgumentIsDictID=1; continue; }
if (!strcmp(argument, "--keep")) { continue; } /* does nothing, since preserving input is default; for gzip/xz compatibility */ if (!strcmp(argument, "--keep")) { continue; } /* does nothing, since preserving input is default; for gzip/xz compatibility */
if (!strcmp(argument, "--ultra")) { FIO_setMaxWLog(0); continue; } if (!strcmp(argument, "--ultra")) { FIO_setMaxWLog(0); continue; }
if (!strcmp(argument, "--sparse")) { FIO_setSparseWrite(2); continue; } if (!strcmp(argument, "--sparse")) { FIO_setSparseWrite(2); continue; }
@ -393,6 +397,14 @@ int main(int argCount, const char** argv)
continue; continue;
} }
if (nextArgumentIsDictID) {
nextArgumentIsDictID = 0;
dictID = 0;
while ((*argument>='0') && (*argument<='9'))
dictID = dictID * 10 + (*argument - '0'), argument++;
continue;
}
/* add filename to list */ /* add filename to list */
filenameTable[filenameIdx++] = argument; filenameTable[filenameIdx++] = argument;
} }
@ -429,6 +441,7 @@ int main(int argCount, const char** argv)
dictParams.compressionLevel = dictCLevel; dictParams.compressionLevel = dictCLevel;
dictParams.selectivityLevel = dictSelect; dictParams.selectivityLevel = dictSelect;
dictParams.notificationLevel = displayLevel; dictParams.notificationLevel = displayLevel;
dictParams.dictID = dictID;
DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, dictParams); DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, dictParams);
#endif #endif
goto _end; goto _end;