minor comment refactor
This commit is contained in:
parent
335780c427
commit
77c137b3ae
@ -375,7 +375,7 @@ static int isIncluded(const void* in, const void* container, size_t length)
|
|||||||
return u==length;
|
return u==length;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*! ZDICT_checkMerge
|
/*! ZDICT_tryMerge() :
|
||||||
check if dictItem can be merged, do it if possible
|
check if dictItem can be merged, do it if possible
|
||||||
@return : id of destination elt, 0 if not merged
|
@return : id of destination elt, 0 if not merged
|
||||||
*/
|
*/
|
||||||
@ -440,8 +440,8 @@ static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const
|
|||||||
|
|
||||||
static void ZDICT_removeDictItem(dictItem* table, U32 id)
|
static void ZDICT_removeDictItem(dictItem* table, U32 id)
|
||||||
{
|
{
|
||||||
/* convention : first element is nb of elts */
|
/* convention : table[0].pos stores nb of elts */
|
||||||
U32 const max = table->pos;
|
U32 const max = table[0].pos;
|
||||||
U32 u;
|
U32 u;
|
||||||
if (!id) return; /* protection, should never happen */
|
if (!id) return; /* protection, should never happen */
|
||||||
for (u=id; u<max-1; u++)
|
for (u=id; u<max-1; u++)
|
||||||
|
@ -44,7 +44,7 @@
|
|||||||
#define SAMPLESIZE_MAX (128 KB)
|
#define SAMPLESIZE_MAX (128 KB)
|
||||||
#define MEMMULT 11 /* rough estimation : memory cost to analyze 1 byte of sample */
|
#define MEMMULT 11 /* rough estimation : memory cost to analyze 1 byte of sample */
|
||||||
#define COVER_MEMMULT 9 /* rough estimation : memory cost to analyze 1 byte of sample */
|
#define COVER_MEMMULT 9 /* rough estimation : memory cost to analyze 1 byte of sample */
|
||||||
static const size_t maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
|
static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
|
||||||
|
|
||||||
#define NOISELENGTH 32
|
#define NOISELENGTH 32
|
||||||
|
|
||||||
@ -98,7 +98,9 @@ const char* DiB_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCo
|
|||||||
* File related operations
|
* File related operations
|
||||||
**********************************************************/
|
**********************************************************/
|
||||||
/** DiB_loadFiles() :
|
/** DiB_loadFiles() :
|
||||||
* @return : nb of files effectively loaded into `buffer` */
|
* load files listed in fileNamesTable into buffer, even if buffer is too small.
|
||||||
|
* @return : nb of files effectively loaded into `buffer`
|
||||||
|
* *bufferSizePtr is modified, it provides the amount of data loaded within buffer */
|
||||||
static unsigned DiB_loadFiles(void* buffer, size_t* bufferSizePtr,
|
static unsigned DiB_loadFiles(void* buffer, size_t* bufferSizePtr,
|
||||||
size_t* fileSizes,
|
size_t* fileSizes,
|
||||||
const char** fileNamesTable, unsigned nbFiles)
|
const char** fileNamesTable, unsigned nbFiles)
|
||||||
@ -139,13 +141,16 @@ static U32 DiB_rand(U32* src)
|
|||||||
return rand32 >> 5;
|
return rand32 >> 5;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* DiB_shuffle() :
|
||||||
|
* shuffle a table of file names in a semi-random way
|
||||||
|
* It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
|
||||||
|
* it will load random elements from it, instead of just the first ones. */
|
||||||
static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
|
static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
|
||||||
/* Initialize the pseudorandom number generator */
|
|
||||||
U32 seed = 0xFD2FB528;
|
U32 seed = 0xFD2FB528;
|
||||||
unsigned i;
|
unsigned i;
|
||||||
for (i = nbFiles - 1; i > 0; --i) {
|
for (i = nbFiles - 1; i > 0; --i) {
|
||||||
unsigned const j = DiB_rand(&seed) % (i + 1);
|
unsigned const j = DiB_rand(&seed) % (i + 1);
|
||||||
const char* tmp = fileNamesTable[j];
|
const char* const tmp = fileNamesTable[j];
|
||||||
fileNamesTable[j] = fileNamesTable[i];
|
fileNamesTable[j] = fileNamesTable[i];
|
||||||
fileNamesTable[i] = tmp;
|
fileNamesTable[i] = tmp;
|
||||||
}
|
}
|
||||||
@ -162,7 +167,7 @@ static size_t DiB_findMaxMem(unsigned long long requiredMem)
|
|||||||
|
|
||||||
requiredMem = (((requiredMem >> 23) + 1) << 23);
|
requiredMem = (((requiredMem >> 23) + 1) << 23);
|
||||||
requiredMem += step;
|
requiredMem += step;
|
||||||
if (requiredMem > maxMemory) requiredMem = maxMemory;
|
if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
|
||||||
|
|
||||||
while (!testmem) {
|
while (!testmem) {
|
||||||
testmem = malloc((size_t)requiredMem);
|
testmem = malloc((size_t)requiredMem);
|
||||||
@ -203,7 +208,7 @@ static void DiB_saveDict(const char* dictFileName,
|
|||||||
|
|
||||||
|
|
||||||
static int g_tooLargeSamples = 0;
|
static int g_tooLargeSamples = 0;
|
||||||
static U64 DiB_getTotalCappedFileSize(const char** fileNamesTable, unsigned nbFiles)
|
static U64 DiB_totalCappedFileSize(const char** fileNamesTable, unsigned nbFiles)
|
||||||
{
|
{
|
||||||
U64 total = 0;
|
U64 total = 0;
|
||||||
unsigned n;
|
unsigned n;
|
||||||
@ -236,7 +241,7 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
|
|||||||
{
|
{
|
||||||
void* const dictBuffer = malloc(maxDictSize);
|
void* const dictBuffer = malloc(maxDictSize);
|
||||||
size_t* const fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
|
size_t* const fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
|
||||||
unsigned long long const totalSizeToLoad = DiB_getTotalCappedFileSize(fileNamesTable, nbFiles);
|
unsigned long long const totalSizeToLoad = DiB_totalCappedFileSize(fileNamesTable, nbFiles);
|
||||||
size_t const memMult = params ? MEMMULT : COVER_MEMMULT;
|
size_t const memMult = params ? MEMMULT : COVER_MEMMULT;
|
||||||
size_t const maxMem = DiB_findMaxMem(totalSizeToLoad * memMult) / memMult;
|
size_t const maxMem = DiB_findMaxMem(totalSizeToLoad * memMult) / memMult;
|
||||||
size_t benchedSize = (size_t) MIN ((unsigned long long)maxMem, totalSizeToLoad);
|
size_t benchedSize = (size_t) MIN ((unsigned long long)maxMem, totalSizeToLoad);
|
||||||
@ -246,8 +251,9 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
|
|||||||
/* Checks */
|
/* Checks */
|
||||||
if (params) g_displayLevel = params->zParams.notificationLevel;
|
if (params) g_displayLevel = params->zParams.notificationLevel;
|
||||||
else if (coverParams) g_displayLevel = coverParams->zParams.notificationLevel;
|
else if (coverParams) g_displayLevel = coverParams->zParams.notificationLevel;
|
||||||
else EXM_THROW(13, "Neither dictionary algorith selected"); /* should not happen */
|
else EXM_THROW(13, "Neither dictionary algorithm selected"); /* should not happen */
|
||||||
if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
|
if ((!fileSizes) || (!srcBuffer) || (!dictBuffer))
|
||||||
|
EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
|
||||||
if (g_tooLargeSamples) {
|
if (g_tooLargeSamples) {
|
||||||
DISPLAYLEVEL(2, "! Warning : some samples are very large \n");
|
DISPLAYLEVEL(2, "! Warning : some samples are very large \n");
|
||||||
DISPLAYLEVEL(2, "! Note that dictionary is only useful for small files or beginning of large files. \n");
|
DISPLAYLEVEL(2, "! Note that dictionary is only useful for small files or beginning of large files. \n");
|
||||||
@ -270,8 +276,7 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
|
|||||||
DiB_shuffle(fileNamesTable, nbFiles);
|
DiB_shuffle(fileNamesTable, nbFiles);
|
||||||
nbFiles = DiB_loadFiles(srcBuffer, &benchedSize, fileSizes, fileNamesTable, nbFiles);
|
nbFiles = DiB_loadFiles(srcBuffer, &benchedSize, fileSizes, fileNamesTable, nbFiles);
|
||||||
|
|
||||||
{
|
{ size_t dictSize;
|
||||||
size_t dictSize;
|
|
||||||
if (params) {
|
if (params) {
|
||||||
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
|
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
|
||||||
dictSize = ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, maxDictSize,
|
dictSize = ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, maxDictSize,
|
||||||
@ -285,8 +290,7 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
|
|||||||
DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\n", coverParams->k, coverParams->d, coverParams->steps);
|
DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\n", coverParams->k, coverParams->d, coverParams->steps);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
dictSize =
|
dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
|
||||||
ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
|
|
||||||
fileSizes, nbFiles, *coverParams);
|
fileSizes, nbFiles, *coverParams);
|
||||||
}
|
}
|
||||||
if (ZDICT_isError(dictSize)) {
|
if (ZDICT_isError(dictSize)) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user