Limit train samples (#2809)

* Limit training samples size to 2GB * simplified DISPLAYLEVEL() macro to use global variable instead of local. * refactored training samples loading * fixed compiler warning * addressed comments from the pull request * addressed @terrelln comments * missed some fixes * fixed type mismatch * Fixed bug passing the estimated number of samples instead of the loaded number of samples. Changed unit conversion not to use bit-shifts. * fixed a declaration after code * fixed type conversion compile errors * fixed more type casting * fixed more type mismatching * changed sizes type to size_t * move type casting * more type cast fixes
2021-10-04 17:47:52 -07:00 · 2021-10-04 17:47:52 -07:00 · 52598d54e9
commit 52598d54e9
parent 7868f38019
5 changed files with 167 additions and 82 deletions
--- a/lib/dictBuilder/cover.c
+++ b/lib/dictBuilder/cover.c
@ -40,6 +40,13 @@
 /*-*************************************
 *  Constants
 ***************************************/
 /**
 * Upper bound on the total size of the training samples.
 * Samples are referenced through 32-bit indexes, so on 64-bit builds
 * (sizeof(size_t) == 8) the limit is (unsigned)-1, i.e. just under 4 GB
 * when `unsigned` is 32 bits wide.
 * For 32-bit builds we choose 1 GB.
 * Most 32-bit platforms have 2 GB of user-mode addressable space and we allocate
 * a large contiguous buffer, so 1 GB is already a high limit.
 */
 #define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
 #define COVER_DEFAULT_SPLITPOINT 1.0
--- a/lib/dictBuilder/fastcover.c
+++ b/lib/dictBuilder/fastcover.c
@ -32,6 +32,13 @@
 /*-*************************************
 *  Constants
 ***************************************/
 /**
 * Upper bound on the total size of the training samples.
 * Samples are referenced through 32-bit indexes, so on 64-bit builds
 * (sizeof(size_t) == 8) the limit is (unsigned)-1, i.e. just under 4 GB
 * when `unsigned` is 32 bits wide.
 * For 32-bit builds we choose 1 GB.
 * Most 32-bit platforms have 2 GB of user-mode addressable space and we allocate
 * a large contiguous buffer, so 1 GB is already a high limit.
 */
 #define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
 #define FASTCOVER_MAX_F 31
 #define FASTCOVER_MAX_ACCEL 10
--- a/programs/dibio.c
+++ b/programs/dibio.c
@ -49,6 +49,7 @@
 static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
 #define NOISELENGTH 32
 #define MAX_SAMPLES_SIZE (2 GB) /* training dataset limited to 2GB */
 /*-*************************************
@ -88,6 +89,15 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
 #undef MIN
 #define MIN(a,b)    ((a) < (b) ? (a) : (b))
 /**
  Returns the size of a file.
  If error returns -1.
 */
 static S64 DiB_getFileSize (const char * fileName)
 {
    U64 const fileSize = UTIL_getFileSize(fileName);
    return (fileSize == UTIL_FILESIZE_UNKNOWN) ? -1 : (S64)fileSize;
 }
 /* ********************************************************
 *  File related operations
@ -101,47 +111,67 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
 * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
 *  sampleSizes is filled with the size of each sample.
 */
-static unsigned DiB_loadFiles(void* buffer, size_t* bufferSizePtr,
+static int DiB_loadFiles(
-                              size_t* sampleSizes, unsigned sstSize,
+    void* buffer, size_t* bufferSizePtr,
-                              const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize,
+    size_t* sampleSizes, int sstSize,
-                              unsigned displayLevel)
+    const char** fileNamesTable, int nbFiles,
    size_t targetChunkSize, int displayLevel )
 {
    char* const buff = (char*)buffer;
-    size_t pos = 0;
+    size_t totalDataLoaded = 0;
-    unsigned nbLoadedChunks = 0, fileIndex;
+    int nbSamplesLoaded = 0;
    int fileIndex = 0;
    FILE * f = NULL;
-    for (fileIndex=0; fileIndex<nbFiles; fileIndex++) {
+    assert(targetChunkSize <= SAMPLESIZE_MAX);
-        const char* const fileName = fileNamesTable[fileIndex];
+
-        unsigned long long const fs64 = UTIL_getFileSize(fileName);
+    while ( nbSamplesLoaded < sstSize && fileIndex < nbFiles ) {
-        unsigned long long remainingToLoad = (fs64 == UTIL_FILESIZE_UNKNOWN) ? 0 : fs64;
+        size_t fileDataLoaded;
-        U32 const nbChunks = targetChunkSize ? (U32)((fs64 + (targetChunkSize-1)) / targetChunkSize) : 1;
+        S64 const fileSize = DiB_getFileSize(fileNamesTable[fileIndex]);
-        U64 const chunkSize = targetChunkSize ? MIN(targetChunkSize, fs64) : fs64;
+        if (fileSize <= 0) /* skip if zero-size or file error */
-        size_t const maxChunkSize = (size_t)MIN(chunkSize, SAMPLESIZE_MAX);
+            continue;
-        U32 cnb;
+
-        FILE* const f = fopen(fileName, "rb");
+        f = fopen( fileNamesTable[fileIndex], "rb");
-        if (f==NULL) EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileName, strerror(errno));
+        if (f == NULL)
-        DISPLAYUPDATE(2, "Loading %s...       \r", fileName);
+            EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileNamesTable[fileIndex], strerror(errno));
-        for (cnb=0; cnb<nbChunks; cnb++) {
+        DISPLAYUPDATE(2, "Loading %s...       \r", fileNamesTable[fileIndex]);
-            size_t const toLoad = (size_t)MIN(maxChunkSize, remainingToLoad);
+
-            if (toLoad > *bufferSizePtr-pos) break;
+        /* Load the first chunk of data from the file */
-            {   size_t const readSize = fread(buff+pos, 1, toLoad, f);
+        fileDataLoaded = targetChunkSize > 0 ?
-                if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName);
+                            (size_t)MIN(fileSize, (S64)targetChunkSize) :
-                pos += readSize;
+                            (size_t)MIN(fileSize, SAMPLESIZE_MAX );
-                sampleSizes[nbLoadedChunks++] = toLoad;
+        if (totalDataLoaded + fileDataLoaded > *bufferSizePtr)
                remainingToLoad -= targetChunkSize;
                if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */
                    fileIndex = nbFiles;  /* stop there */
            break;
        if (fread( buff+totalDataLoaded, 1, fileDataLoaded, f ) != fileDataLoaded)
            EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
        sampleSizes[nbSamplesLoaded++] = fileDataLoaded;
        totalDataLoaded += fileDataLoaded;
        /* If file-chunking is enabled, load the rest of the file as more samples */
        if (targetChunkSize > 0) {
            while( (S64)fileDataLoaded < fileSize && nbSamplesLoaded < sstSize ) {
                size_t const chunkSize = MIN((size_t)(fileSize-fileDataLoaded), targetChunkSize);
                if (totalDataLoaded + chunkSize > *bufferSizePtr) /* buffer is full */
                    break;
                if (fread( buff+totalDataLoaded, 1, chunkSize, f ) != chunkSize)
                    EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
                sampleSizes[nbSamplesLoaded++] = chunkSize;
                totalDataLoaded += chunkSize;
                fileDataLoaded += chunkSize;
            }
-                if (toLoad < targetChunkSize) {
+        }
-                    fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR);
+        fileIndex += 1;
-        }   }   }
+        fclose(f); f = NULL;
    }
    if (f != NULL)
        fclose(f);
-    }
+
    DISPLAYLEVEL(2, "\r%79s\r", "");
-    *bufferSizePtr = pos;
+    DISPLAYLEVEL(4, "Loaded %d KB total training data, %d nb samples \n",
-    DISPLAYLEVEL(4, "loaded : %u KB \n", (unsigned)(pos >> 10))
+        (int)(totalDataLoaded / (1 KB)), nbSamplesLoaded );
-    return nbLoadedChunks;
+    *bufferSizePtr = totalDataLoaded;
    return nbSamplesLoaded;
 }
 #define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r)))
@ -223,11 +253,10 @@ static void DiB_saveDict(const char* dictFileName,
      if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
 }
 /* Sizing summary of the training set, as filled in by DiB_fileStats(). */
 typedef struct {
-    U64 totalSizeToLoad;
+    S64 totalSizeToLoad;      /* sum of the sizes counted for every sample file */
-    unsigned oneSampleTooLarge;
+    int nbSamples;            /* nb of samples the files are expected to yield */
-    unsigned nbSamples;
+    int oneSampleTooLarge;    /* set when a whole-file sample is larger than 2*SAMPLESIZE_MAX */
 } fileStats;
 /*! DiB_fileStats() :
@ -235,45 +264,86 @@ typedef struct {
 *  provides the amount of data to be loaded and the resulting nb of samples.
 *  This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
 */
-static fileStats DiB_fileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel)
+static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t chunkSize, int displayLevel)
 {
    fileStats fs;
-    unsigned n;
+    int n;
    memset(&fs, 0, sizeof(fs));
    // We assume that if chunking is requsted, the chunk size is < SAMPLESIZE_MAX
    assert( chunkSize <= SAMPLESIZE_MAX );
    for (n=0; n<nbFiles; n++) {
-        U64 const fileSize = UTIL_getFileSize(fileNamesTable[n]);
+      S64 const fileSize = DiB_getFileSize(fileNamesTable[n]);
-        U64 const srcSize = (fileSize == UTIL_FILESIZE_UNKNOWN) ? 0 : fileSize;
+      // TODO: is there a minimum sample size? What if the file is 1-byte?
-        U32 const nbSamples = (U32)(chunkSize ? (srcSize + (chunkSize-1)) / chunkSize : 1);
+      if (fileSize == 0) {
-        U64 const chunkToLoad = chunkSize ? MIN(chunkSize, srcSize) : srcSize;
+        DISPLAYLEVEL(3, "Sample file '%s' has zero size, skipping...\n", fileNamesTable[n]);
-        size_t const cappedChunkSize = (size_t)MIN(chunkToLoad, SAMPLESIZE_MAX);
+        continue;
        fs.totalSizeToLoad += cappedChunkSize * nbSamples;
        fs.oneSampleTooLarge |= (chunkSize > 2*SAMPLESIZE_MAX);
        fs.nbSamples += nbSamples;
      }
-    DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (unsigned)(fs.totalSizeToLoad >> 10));
+
      /* the case where we are breaking up files in sample chunks */
      if (chunkSize > 0)
      {
        // TODO: is there a minimum sample size? Can we have a 1-byte sample?
        fs.nbSamples += (int)((fileSize + chunkSize-1) / chunkSize);
        fs.totalSizeToLoad += fileSize;
      }
      else {
      /* the case where one file is one sample */
        if (fileSize > SAMPLESIZE_MAX) {
          /* flag excessively large sample files */
          fs.oneSampleTooLarge |= (fileSize > 2*SAMPLESIZE_MAX);
          /* Limit to the first SAMPLESIZE_MAX (128kB) of the file */
          DISPLAYLEVEL(3, "Sample file '%s' is too large, limiting to %d KB",
              fileNamesTable[n], SAMPLESIZE_MAX / (1 KB));
        }
        fs.nbSamples += 1;
        fs.totalSizeToLoad += MIN(fileSize, SAMPLESIZE_MAX);
      }
    }
    DISPLAYLEVEL(4, "Found training data %d files, %d KB, %d samples\n", nbFiles, (int)(fs.totalSizeToLoad / (1 KB)), fs.nbSamples);
    return fs;
 }
-
+int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
-int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
+                       const char** fileNamesTable, int nbFiles, size_t chunkSize,
                       const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
                       ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
                       ZDICT_fastCover_params_t* fastCoverParams, int optimize)
 {
-    unsigned const displayLevel = params ? params->zParams.notificationLevel :
+    fileStats fs;
-                        coverParams ? coverParams->zParams.notificationLevel :
+    size_t* sampleSizes; /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
-                        fastCoverParams ? fastCoverParams->zParams.notificationLevel :
+    int nbSamplesLoaded; /* nb of samples effectively loaded in srcBuffer */
-                        0;   /* should never happen */
+    size_t loadedSize; /* total data loaded in srcBuffer for all samples */
    void* srcBuffer /* contiguous buffer with training data/samples */;
    void* const dictBuffer = malloc(maxDictSize);
-    fileStats const fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
+    int result = 0;
-    size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
+
-    size_t const memMult = params ? MEMMULT :
+    int const displayLevel = params ? params->zParams.notificationLevel :
        coverParams ? coverParams->zParams.notificationLevel :
        fastCoverParams ? fastCoverParams->zParams.notificationLevel : 0;
    /* Shuffle input files before we start assessing how much sample data to load.
       The purpose of the shuffle is to pick random samples when the sample
       set is larger than what we can load in memory. */
    DISPLAYLEVEL(3, "Shuffling input files\n");
    DiB_shuffle(fileNamesTable, nbFiles);
    /* Figure out how much sample data to load with how many samples */
    fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
    {
        int const memMult = params ? MEMMULT :
                            coverParams ? COVER_MEMMULT:
                            FASTCOVER_MEMMULT;
        size_t const maxMem =  DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
-    size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
+        /* Limit the size of the training data to the free memory */
-    void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
+        /* Limit the size of the training data to 2GB */
-    int result = 0;
+        /* TODO: there is an opportunity to stop DiB_fileStats() early when the data limit is reached */
        loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
        srcBuffer = malloc(loadedSize+NOISELENGTH);
        sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
    }
    /* Checks */
    if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer))
@ -289,31 +359,32 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
        DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
    }
-    if (fs.totalSizeToLoad < (unsigned long long)maxDictSize * 8) {
+    if (fs.totalSizeToLoad < (S64)maxDictSize * 8) {
        DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
        DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
    }
    /* init */
-    if (loadedSize < fs.totalSizeToLoad)
+    if ((S64)loadedSize < fs.totalSizeToLoad)
-        DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20));
+        DISPLAYLEVEL(1, "Training samples set too large (%u MB); training on %u MB only...\n",
            (unsigned)(fs.totalSizeToLoad / (1 MB)),
            (unsigned)(loadedSize / (1 MB)));
    /* Load input buffer */
-    DISPLAYLEVEL(3, "Shuffling input files\n");
+    nbSamplesLoaded = DiB_loadFiles(
-    DiB_shuffle(fileNamesTable, nbFiles);
+        srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable,
-
+        nbFiles, chunkSize, displayLevel);
    DiB_loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel);
    {   size_t dictSize;
        if (params) {
            DiB_fillNoise((char*)srcBuffer + loadedSize, NOISELENGTH);   /* guard band, for end of buffer condition */
            dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize,
-                                                    srcBuffer, sampleSizes, fs.nbSamples,
+                                                    srcBuffer, sampleSizes, nbSamplesLoaded,
                                                    *params);
        } else if (coverParams) {
            if (optimize) {
              dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
-                                                             srcBuffer, sampleSizes, fs.nbSamples,
+                                                             srcBuffer, sampleSizes, nbSamplesLoaded,
                                                             coverParams);
              if (!ZDICT_isError(dictSize)) {
                  unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
@ -322,13 +393,13 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
              }
            } else {
              dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
-                                                     sampleSizes, fs.nbSamples, *coverParams);
+                                                     sampleSizes, nbSamplesLoaded, *coverParams);
            }
        } else {
            assert(fastCoverParams != NULL);
            if (optimize) {
              dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
-                                                              srcBuffer, sampleSizes, fs.nbSamples,
+                                                              srcBuffer, sampleSizes, nbSamplesLoaded,
                                                              fastCoverParams);
              if (!ZDICT_isError(dictSize)) {
                unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
@ -338,7 +409,7 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
              }
            } else {
              dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
-                                                        sampleSizes, fs.nbSamples, *fastCoverParams);
+                                                        sampleSizes, nbSamplesLoaded, *fastCoverParams);
            }
        }
        if (ZDICT_isError(dictSize)) {
--- a/programs/dibio.h
+++ b/programs/dibio.h
@ -31,8 +31,8 @@
    `parameters` is optional and can be provided with values set to 0, meaning "default".
    @return : 0 == ok. Any other : error.
 */
-int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
+int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
-                       const char** fileNamesTable, unsigned nbFiles, size_t chunkSize,
+                       const char** fileNamesTable, int nbFiles, size_t chunkSize,
                       ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
                       ZDICT_fastCover_params_t* fastCoverParams, int optimize);
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@ -1290,18 +1290,18 @@ int main(int argCount, const char* argv[])
            int const optimize = !coverParams.k || !coverParams.d;
            coverParams.nbThreads = (unsigned)nbWorkers;
            coverParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (unsigned)filenames->tableSize, blockSize, NULL, &coverParams, NULL, optimize);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, &coverParams, NULL, optimize);
        } else if (dict == fastCover) {
            int const optimize = !fastCoverParams.k || !fastCoverParams.d;
            fastCoverParams.nbThreads = (unsigned)nbWorkers;
            fastCoverParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (unsigned)filenames->tableSize, blockSize, NULL, NULL, &fastCoverParams, optimize);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, NULL, &fastCoverParams, optimize);
        } else {
            ZDICT_legacy_params_t dictParams;
            memset(&dictParams, 0, sizeof(dictParams));
            dictParams.selectivityLevel = dictSelect;
            dictParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (unsigned)filenames->tableSize, blockSize, &dictParams, NULL, NULL, 0);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, &dictParams, NULL, NULL, 0);
        }
 #else
        (void)dictCLevel; (void)dictSelect; (void)dictID;  (void)maxDictSize; /* not used when ZSTD_NODICT set */