Integrated dictionary builder into zstd cli

This commit is contained in:
Yann Collet 2016-02-12 02:31:57 +01:00
parent fb92a78ac1
commit 71eafdd23f
5 changed files with 506 additions and 138 deletions

View File

@ -28,7 +28,7 @@
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
You can contact the author at : You can contact the author at :
- Zstd source repository : https://www.zstd.net - Zstd homepage : https://www.zstd.net
*/ */
/*-************************************** /*-**************************************
@ -124,23 +124,6 @@ void ZDICT_printHex(U32 dlevel, const void* ptr, size_t length)
} }
/*-*************************************
* Exceptions
***************************************/
#ifndef DEBUG
# define DEBUG 0
#endif
#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
#define EXM_THROW(error, ...) \
{ \
DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
DISPLAYLEVEL(1, "Error %i : ", error); \
DISPLAYLEVEL(1, __VA_ARGS__); \
DISPLAYLEVEL(1, "\n"); \
exit(error); \
}
/*-******************************************************** /*-********************************************************
* Helper functions * Helper functions
**********************************************************/ **********************************************************/
@ -516,7 +499,7 @@ static U32 ZDICT_dictSize(const dictItem* dictList)
} }
static void ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize, static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
const void* const buffer, const size_t bufferSize, /* buffer must end with noisy guard band */ const void* const buffer, const size_t bufferSize, /* buffer must end with noisy guard band */
const size_t* fileSizes, unsigned nbFiles, const size_t* fileSizes, unsigned nbFiles,
U32 shiftRatio, unsigned maxDictSize) U32 shiftRatio, unsigned maxDictSize)
@ -528,18 +511,21 @@ static void ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos)); U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
U32 minRatio = nbFiles >> shiftRatio; U32 minRatio = nbFiles >> shiftRatio;
int divSuftSortResult; int divSuftSortResult;
size_t result = 0;
/* init */ /* init */
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) {
EXM_THROW(1, "not enough memory for ZDICT_trainBuffer"); result = ERROR(memory_allocation);
goto _cleanup;
}
if (minRatio < MINRATIO) minRatio = MINRATIO; if (minRatio < MINRATIO) minRatio = MINRATIO;
memset(doneMarks, 0, bufferSize+16); memset(doneMarks, 0, bufferSize+16);
/* sort */ /* sort */
DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20)); DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0); divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
if (divSuftSortResult != 0) EXM_THROW(2, "sort failed"); if (divSuftSortResult != 0) { result = ERROR(GENERIC); goto _cleanup; }
suffix[bufferSize] = (int)bufferSize; /* leads into noise */ suffix[bufferSize] = (int)bufferSize; /* leads into noise */
suffix0[0] = (int)bufferSize; /* leads into noise */ suffix0[0] = (int)bufferSize; /* leads into noise */
{ {
@ -578,10 +564,12 @@ static void ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
dictList->pos = n; dictList->pos = n;
} }
_cleanup:
free(suffix0); free(suffix0);
free(reverseSuffix); free(reverseSuffix);
free(doneMarks); free(doneMarks);
free(filePos); free(filePos);
return result;
} }
@ -661,7 +649,11 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
esr.ref = ZSTD_createCCtx(); esr.ref = ZSTD_createCCtx();
esr.zc = ZSTD_createCCtx(); esr.zc = ZSTD_createCCtx();
esr.workPlace = malloc(BLOCKSIZE); esr.workPlace = malloc(BLOCKSIZE);
if (!esr.ref || !esr.zc || !esr.workPlace) EXM_THROW(30, "Not enough memory"); if (!esr.ref || !esr.zc || !esr.workPlace) {
eSize = ERROR(memory_allocation);
DISPLAYLEVEL(1, "Not enough memory");
goto _cleanup;
}
if (compressionLevel==0) compressionLevel=g_compressionLevel_default; if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
params = ZSTD_getParams(compressionLevel, dictBufferSize + 15 KB); params = ZSTD_getParams(compressionLevel, dictBufferSize + 15 KB);
params.strategy = ZSTD_greedy; params.strategy = ZSTD_greedy;
@ -677,50 +669,82 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
/* analyze */ /* analyze */
errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog); errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
if (HUF_isError(errorCode)) EXM_THROW(31, "HUF_buildCTable error"); if (HUF_isError(errorCode)) {
eSize = ERROR(GENERIC);
DISPLAYLEVEL(1, "HUF_buildCTable error");
goto _cleanup;
}
huffLog = (U32)errorCode; huffLog = (U32)errorCode;
total=0; for (u=0; u<=OFFCODE_MAX; u++) total+=offcodeCount[u]; total=0; for (u=0; u<=OFFCODE_MAX; u++) total+=offcodeCount[u];
errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, OFFCODE_MAX); errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, OFFCODE_MAX);
if (FSE_isError(errorCode)) EXM_THROW(32, "FSE_normalizeCount error with offcodeCount"); if (FSE_isError(errorCode)) {
eSize = ERROR(GENERIC);
DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount");
goto _cleanup;
}
Offlog = (U32)errorCode; Offlog = (U32)errorCode;
total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u]; total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u];
errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML); errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
if (FSE_isError(errorCode)) EXM_THROW(33, "FSE_normalizeCount error with matchLengthCount"); if (FSE_isError(errorCode)) {
eSize = ERROR(GENERIC);
DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount");
goto _cleanup;
}
mlLog = (U32)errorCode; mlLog = (U32)errorCode;
total=0; for (u=0; u<=MaxLL; u++) total+=litlengthCount[u]; total=0; for (u=0; u<=MaxLL; u++) total+=litlengthCount[u];
errorCode = FSE_normalizeCount(litlengthNCount, llLog, litlengthCount, total, MaxLL); errorCode = FSE_normalizeCount(litlengthNCount, llLog, litlengthCount, total, MaxLL);
if (FSE_isError(errorCode)) EXM_THROW(34, "FSE_normalizeCount error with litlengthCount"); if (FSE_isError(errorCode)) {
eSize = ERROR(GENERIC);
DISPLAYLEVEL(1, "FSE_normalizeCount error with litlengthCount");
goto _cleanup;
}
llLog = (U32)errorCode; llLog = (U32)errorCode;
/* write result to buffer */ /* write result to buffer */
errorCode = HUF_writeCTable(dstBuffer, maxDstSize, hufTable, 255, huffLog); errorCode = HUF_writeCTable(dstBuffer, maxDstSize, hufTable, 255, huffLog);
if (HUF_isError(errorCode)) EXM_THROW(41, "HUF_writeCTable error"); if (HUF_isError(errorCode)) {
eSize = ERROR(GENERIC);
DISPLAYLEVEL(1, "HUF_writeCTable error");
goto _cleanup;
}
dstBuffer = (char*)dstBuffer + errorCode; dstBuffer = (char*)dstBuffer + errorCode;
maxDstSize -= errorCode; maxDstSize -= errorCode;
eSize += errorCode; eSize += errorCode;
errorCode = FSE_writeNCount(dstBuffer, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog); errorCode = FSE_writeNCount(dstBuffer, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
if (FSE_isError(errorCode)) EXM_THROW(42, "FSE_writeNCount error with offcodeNCount"); if (FSE_isError(errorCode)) {
eSize = ERROR(GENERIC);
DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount");
goto _cleanup;
}
dstBuffer = (char*)dstBuffer + errorCode; dstBuffer = (char*)dstBuffer + errorCode;
maxDstSize -= errorCode; maxDstSize -= errorCode;
eSize += errorCode; eSize += errorCode;
errorCode = FSE_writeNCount(dstBuffer, maxDstSize, matchLengthNCount, MaxML, mlLog); errorCode = FSE_writeNCount(dstBuffer, maxDstSize, matchLengthNCount, MaxML, mlLog);
if (FSE_isError(errorCode)) EXM_THROW(43, "FSE_writeNCount error with matchLengthNCount"); if (FSE_isError(errorCode)) {
eSize = ERROR(GENERIC);
DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount");
goto _cleanup;
}
dstBuffer = (char*)dstBuffer + errorCode; dstBuffer = (char*)dstBuffer + errorCode;
maxDstSize -= errorCode; maxDstSize -= errorCode;
eSize += errorCode; eSize += errorCode;
errorCode = FSE_writeNCount(dstBuffer, maxDstSize, litlengthNCount, MaxLL, llLog); errorCode = FSE_writeNCount(dstBuffer, maxDstSize, litlengthNCount, MaxLL, llLog);
if (FSE_isError(errorCode)) EXM_THROW(43, "FSE_writeNCount error with litlengthNCount"); if (FSE_isError(errorCode)) {
eSize = ERROR(GENERIC);
DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount");
goto _cleanup;
}
dstBuffer = (char*)dstBuffer + errorCode; dstBuffer = (char*)dstBuffer + errorCode;
maxDstSize -= errorCode; maxDstSize -= errorCode;
eSize += errorCode; eSize += errorCode;
/* clean */ _cleanup:
ZSTD_freeCCtx(esr.ref); ZSTD_freeCCtx(esr.ref);
ZSTD_freeCCtx(esr.zc); ZSTD_freeCCtx(esr.zc);
free(esr.workPlace); free(esr.workPlace);
@ -730,12 +754,12 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
#define DIB_FASTSEGMENTSIZE 64 #define DIB_FASTSEGMENTSIZE 64
/*! ZDICT_fastSampling (based on an idea by Giuseppe Ottaviano) /*! ZDICT_fastSampling() (based on an idea proposed by Giuseppe Ottaviano) :
Fill `dictBuffer` with stripes of size DIB_FASTSEGMENTSIZE from `samplesBuffer` Fill `dictBuffer` with stripes of size DIB_FASTSEGMENTSIZE from `samplesBuffer`,
up to `dictSize`. up to `dictSize`.
Filling starts from the end of `dictBuffer`, down to maximum possible. Filling starts from the end of `dictBuffer`, down to maximum possible.
if `dictSize` is not a multiple of DIB_FASTSEGMENTSIZE, some bytes at beginning of `dictBuffer` won't be used. if `dictSize` is not a multiple of DIB_FASTSEGMENTSIZE, some bytes at beginning of `dictBuffer` won't be used.
@return : amount of data written into `dictBuffer` @return : amount of data written into `dictBuffer`,
or an error code or an error code
*/ */
static size_t ZDICT_fastSampling(void* dictBuffer, size_t dictSize, static size_t ZDICT_fastSampling(void* dictBuffer, size_t dictSize,

View File

@ -84,12 +84,12 @@ default: zstd
all: zstd zstd32 fullbench fullbench32 fuzzer fuzzer32 zbufftest zbufftest32 paramgrill datagen all: zstd zstd32 fullbench fullbench32 fuzzer fuzzer32 zbufftest zbufftest32 paramgrill datagen
zstd : $(ZSTD_FILES) $(ZSTDDIR)/zstd_buffered.c \ zstd : $(ZSTD_FILES) $(ZSTDDIR)/zstd_buffered.c $(ZSTDDIR)/dictBuilder.c $(ZSTDDIR)/divsufsort.c \
zstdcli.c fileio.c $(ZSTD_FILEIO_LEGACY) bench.c xxhash.c datagen.c zstdcli.c fileio.c $(ZSTD_FILEIO_LEGACY) bench.c xxhash.c datagen.c dibio.c
$(CC) $(FLAGS) $^ -o $@$(EXT) $(CC) $(FLAGS) $^ -o $@$(EXT)
zstd32: $(ZSTD_FILES) $(ZSTDDIR)/zstd_buffered.c \ zstd32: $(ZSTD_FILES) $(ZSTDDIR)/zstd_buffered.c $(ZSTDDIR)/dictBuilder.c $(ZSTDDIR)/divsufsort.c \
zstdcli.c fileio.c $(ZSTD_FILEIO_LEGACY) bench.c xxhash.c datagen.c zstdcli.c fileio.c $(ZSTD_FILEIO_LEGACY) bench.c xxhash.c datagen.c dibio.c
$(CC) -m32 $(FLAGS) $^ -o $@$(EXT) $(CC) -m32 $(FLAGS) $^ -o $@$(EXT)
zstd_nolegacy : zstd_nolegacy :
@ -101,12 +101,12 @@ zstd-pgo : clean zstd
rm zstd rm zstd
$(MAKE) zstd MOREFLAGS=-fprofile-use $(MAKE) zstd MOREFLAGS=-fprofile-use
zstd-noBench: $(ZSTD_FILES) $(ZSTDDIR)/zstd_buffered.c \ zstd-frugal: $(ZSTD_FILES) $(ZSTDDIR)/zstd_buffered.c \
zstdcli.c fileio.c $(ZSTD_FILEIO_LEGACY) zstdcli.c fileio.c $(ZSTD_FILEIO_LEGACY)
$(CC) $(FLAGS) -DZSTD_NOBENCH $^ -o zstd$(EXT) $(CC) $(FLAGS) -DZSTD_NOBENCH -DZSTD_NODICT $^ -o zstd$(EXT)
zstd-frugal: clean zstd-small: clean
$(MAKE) zstd-noBench ZSTD_LEGACY_SUPPORT=0 CFLAGS=-Os $(MAKE) zstd-frugal ZSTD_LEGACY_SUPPORT=0
fullbench : $(ZSTD_FILES) \ fullbench : $(ZSTD_FILES) \
datagen.c fullbench.c datagen.c fullbench.c
@ -140,7 +140,7 @@ datagen : datagen.c datagencli.c
$(CC) $(FLAGS) $^ -o $@$(EXT) $(CC) $(FLAGS) $^ -o $@$(EXT)
clean: clean:
@rm -f core *.o tmp* result* *.gcda \ @rm -f core *.o tmp* result* *.gcda dictionary *.zst \
zstd$(EXT) zstd32$(EXT) \ zstd$(EXT) zstd32$(EXT) \
fullbench$(EXT) fullbench32$(EXT) \ fullbench$(EXT) fullbench32$(EXT) \
fuzzer$(EXT) fuzzer32$(EXT) zbufftest$(EXT) zbufftest32$(EXT) \ fuzzer$(EXT) fuzzer32$(EXT) zbufftest$(EXT) zbufftest32$(EXT) \

275
programs/dibio.c Normal file
View File

@ -0,0 +1,275 @@
/*
dibio - I/O API for dictionary builder
Copyright (C) Yann Collet 2016
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
You can contact the author at :
- zstd homepage : http://www.zstd.net/
*/
/*-**************************************
* Compiler Options
****************************************/
/* Disable some Visual warning messages */
#ifdef _MSC_VER
# define _CRT_SECURE_NO_WARNINGS /* fopen */
# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
#endif
/* Unix Large Files support (>4GB) */
#define _FILE_OFFSET_BITS 64
#if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */
# define _LARGEFILE_SOURCE
#elif ! defined(__LP64__) /* No point defining Large file for 64 bit */
# define _LARGEFILE64_SOURCE
#endif
/*-*************************************
* Includes
***************************************/
#include <stdlib.h> /* malloc, free */
#include <string.h> /* memset */
#include <stdio.h> /* fprintf, fopen, ftello64 */
#include <sys/types.h> /* stat64 */
#include <sys/stat.h> /* stat64 */
#include <time.h> /* clock */
#include "mem.h" /* read */
#include "error_private.h"
#include "dictBuilder_static.h"
/*-*************************************
* Compiler specifics
***************************************/
/* Fallback for toolchains whose <sys/stat.h> does not provide S_ISREG :
 * a mode describes a regular file when its S_IFMT type bits equal S_IFREG. */
#if !defined(S_ISREG)
# define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
#endif
/*-*************************************
* Constants
***************************************/
/* Size suffixes : written after a number, e.g. `8 MB` expands to `8 *(1 <<20)` */
#define KB *(1 <<10)
#define MB *(1 <<20)
#define GB *(1U<<30)
/* NOTE(review): DICTLISTSIZE is not referenced in the visible part of this file - confirm it is still needed */
#define DICTLISTSIZE 10000
/* DiB_trainFromFiles() probes for MEMMULT times the total sample size,
 * then keeps 1/MEMMULT of the amount found as the sample buffer size */
#define MEMMULT 11
/* Upper bound applied by DiB_findMaxMem() :
 * (2 GB - 64 MB) when size_t is 4 bytes wide,
 * otherwise 512 MB shifted left by sizeof(size_t) bits */
static const size_t maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
/* Size of the noise guard band appended after the samples by DiB_trainFromFiles() */
#define NOISELENGTH 32
/* DiB_fillNoise() generator : PRIME1 is the initial state, PRIME2 the multiplier */
#define PRIME1 2654435761U
#define PRIME2 2246822519U
/*-*************************************
* Console display
***************************************/
/* DISPLAY() : unconditional printf-style output on stderr */
#define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
/* DISPLAYLEVEL() : same as DISPLAY(), but only when g_displayLevel >= l.
 * Expands to a bare `if` statement : do not follow a call with `else`. */
#define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
/* Current verbosity of this module; changed through DiB_setNotificationLevel() */
static unsigned g_displayLevel = 0; /* 0 : no display; 1: errors; 2: default; 4: full information */
/*! DiB_setNotificationLevel() :
    Store `l` as the display level of this module,
    then hand the same level over to ZDICT_setNotificationLevel(). */
void DiB_setNotificationLevel(unsigned l)
{
    g_displayLevel = l;
    ZDICT_setNotificationLevel(l);
}
/*! DiB_printHex() :
    Display the `length` bytes starting at `ptr`, one character per byte,
    whenever the current display level is at least `dlevel`.
    Bytes outside the range [32, 126] are shown as '.'.
    Note : despite its name, the output is character-based, not hexadecimal. */
void DiB_printHex(U32 dlevel, const void* ptr, size_t length)
{
    const BYTE* const bytes = (const BYTE*)ptr;
    size_t pos;

    for (pos = 0; pos != length; ++pos) {
        BYTE const raw = bytes[pos];
        BYTE const shown = ((raw >= 32) && (raw <= 126)) ? raw : (BYTE)'.';   /* mask non-printable char */
        DISPLAYLEVEL(dlevel, "%c", shown);
    }
}
/*-*************************************
* Exceptions
***************************************/
/* DEBUG : set to a non-zero value at build time to enable DEBUGOUTPUT() traces */
#ifndef DEBUG
# define DEBUG 0
#endif
/* DEBUGOUTPUT() : printf-style trace on stderr, emitted only when DEBUG is non-zero.
 * Expands to a bare `if` statement that already ends with `;`. */
#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
/* EXM_THROW() : report a fatal error, then terminate the process with exit(error).
 * `error` is both the number printed and the process exit status;
 * the remaining arguments form a printf-style message, shown when display level >= 1.
 * Expands to a braced block, so call sites compile with or without a trailing `;`
 * (some callers in this file rely on that). */
#define EXM_THROW(error, ...) \
{ \
DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
DISPLAYLEVEL(1, "Error %i : ", error); \
DISPLAYLEVEL(1, __VA_ARGS__); \
DISPLAYLEVEL(1, "\n"); \
exit(error); \
}
/* ********************************************************
* Helper functions
**********************************************************/
/*! DiB_isError() :
    @return : whatever ERR_isError() reports for `errorCode`
              (non-zero meaning "this is an error code") */
unsigned DiB_isError(size_t errorCode)
{
    return ERR_isError(errorCode);
}
/*! DiB_getErrorName() :
    @return : the string provided by ERR_getErrorName() for `errorCode` */
const char* DiB_getErrorName(size_t errorCode)
{
    return ERR_getErrorName(errorCode);
}
/* ********************************************************
* File related operations
**********************************************************/
/*! DiB_getFileSize() :
    @return : size of file `infilename`, in bytes,
              or 0 when the file cannot be stat'ed or is not a regular file */
static unsigned long long DiB_getFileSize(const char* infilename)
{
#if defined(_MSC_VER)
    struct _stat64 info;
    int const statFailed = _stat64(infilename, &info);
#else
    struct stat info;
    int const statFailed = stat(infilename, &info);
#endif
    if (statFailed) return 0;                    /* file not reachable */
    if (!S_ISREG(info.st_mode)) return 0;        /* directory, device, ... */
    return (unsigned long long)info.st_size;
}
/*! DiB_getTotalFileSize() :
    @return : sum of DiB_getFileSize() over the `nbFiles` entries of `fileNamesTable`
              (entries which are not readable regular files count for 0) */
static unsigned long long DiB_getTotalFileSize(const char** fileNamesTable, unsigned nbFiles)
{
    unsigned long long sum = 0;
    unsigned idx = 0;

    while (idx != nbFiles) {
        sum += DiB_getFileSize(fileNamesTable[idx]);
        ++idx;
    }
    return sum;
}
/*! DiB_loadFiles() :
    Read the `nbFiles` files listed in `fileNamesTable` back to back into `buffer`
    (capacity `bufferSize`), and record in `fileSizes[n]` how many bytes of file n were loaded.
    A file which does not fit into the remaining space is recorded with size 0 and contributes no data.
    A file which cannot be opened, or a short read, terminates the program through EXM_THROW(). */
static void DiB_loadFiles(void* buffer, size_t bufferSize,
                          size_t* fileSizes,
                          const char** fileNamesTable, unsigned nbFiles)
{
    char* const dst = (char*)buffer;
    size_t filled = 0;
    unsigned idx;

    for (idx = 0; idx < nbFiles; idx++) {
        unsigned long long toLoad = DiB_getFileSize(fileNamesTable[idx]);
        FILE* const src = fopen(fileNamesTable[idx], "rb");
        size_t loaded;

        if (src == NULL) EXM_THROW(10, "impossible to open file %s", fileNamesTable[idx]);
        DISPLAYLEVEL(2, "Loading %s... \r", fileNamesTable[idx]);

        /* not enough room left : load nothing from this file */
        if (toLoad > bufferSize - filled) toLoad = 0;

        loaded = fread(dst + filled, 1, (size_t)toLoad, src);
        if (loaded != (size_t)toLoad) EXM_THROW(11, "could not read %s", fileNamesTable[idx]);

        filled += loaded;
        fileSizes[idx] = (size_t)toLoad;
        fclose(src);
    }
}
/*-********************************************************
* Dictionary training functions
**********************************************************/
/*! DiB_findMaxMem() :
    Probe how much memory malloc() is able to deliver, aiming at `requiredMem` bytes.
    The request is rounded up to the next 8 MB boundary, increased by two steps of 8 MB,
    and capped to `maxMemory`. Allocation is then attempted with sizes decreasing
    by one step at a time until one succeeds; the test block is released immediately.
    @return : one step (8 MB) less than the first size malloc() accepted,
              or 0 when no probe succeeded.
    Note : probing stops once a single step remains, so the size counter can neither
           wrap around below zero nor rely on the implementation-defined result of malloc(0). */
static size_t DiB_findMaxMem(unsigned long long requiredMem)
{
    size_t const step = 8 MB;

    requiredMem = (((requiredMem >> 23) + 1) << 23);   /* next multiple of 8 MB (1<<23) */
    requiredMem += 2 * step;
    if (requiredMem > maxMemory) requiredMem = maxMemory;

    while (requiredMem > step) {
        void* testmem;
        requiredMem -= step;
        testmem = malloc((size_t)requiredMem);
        if (testmem != NULL) {
            free(testmem);
            return (size_t)(requiredMem - step);   /* requiredMem >= step here : no wrap-around */
        }
    }

    return 0;   /* nothing could be allocated */
}
/*! DiB_fillNoise() :
    Fill the `length` bytes of `buffer` with a deterministic pseudo-random sequence :
    the state starts at PRIME1 and is multiplied by PRIME2 before each byte;
    each byte written is the state shifted right by 21 bits, truncated to 8 bits. */
static void DiB_fillNoise(void* buffer, size_t length)
{
    unsigned char* const out = (unsigned char*)buffer;
    unsigned state = PRIME1;
    size_t idx;

    for (idx = 0; idx != length; ++idx) {
        state *= PRIME2;
        out[idx] = (unsigned char)(state >> 21);
    }
}
/*! DiB_saveDict() :
    Write the `buffSize` bytes of `buff` into file `dictFileName` (created or overwritten).
    Failure to open, write or close the file terminates the program through EXM_THROW(). */
static void DiB_saveDict(const char* dictFileName,
                         const void* buff, size_t buffSize)
{
    FILE* const out = fopen(dictFileName, "wb");
    if (out == NULL) EXM_THROW(3, "cannot open %s ", dictFileName);

    {   size_t const written = fwrite(buff, 1, buffSize, out);
        if (written != buffSize) EXM_THROW(4, "%s : write error", dictFileName);
    }

    /* fclose() flushes pending data : a failure here means the dictionary is incomplete */
    if (fclose(out) != 0) EXM_THROW(5, "%s : flush error", dictFileName);
}
/*! DiB_trainFromFiles() :
    Load the `nbFiles` sample files listed in `fileNamesTable`, train a dictionary from them
    with ZDICT_trainFromBuffer_unsafe(), and save the result into file `dictFileName`.
    `maxDictSize` is the capacity of the dictionary buffer handed to the trainer.
    `params` is passed unchanged to the trainer.
    When memory probing finds less than the total size of the samples, training uses
    only the amount of data that fits, and a notice is displayed.
    @return : 0 on success, 1 when training reports an error.
    Note : allocation and file errors do not return; they end the program through EXM_THROW(). */
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
                       const char** fileNamesTable, unsigned nbFiles,
                       ZDICT_params_t params)
{
    void* srcBuffer;
    size_t benchedSize;
    size_t* fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
    unsigned long long totalSizeToLoad = DiB_getTotalFileSize(fileNamesTable, nbFiles);
    void* dictBuffer = malloc(maxDictSize);
    size_t dictSize;
    int result = 0;

    /* init : the sample buffer gets 1/MEMMULT of the memory found by probing for MEMMULT x total size */
    benchedSize = DiB_findMaxMem(totalSizeToLoad * MEMMULT) / MEMMULT;
    if ((unsigned long long)benchedSize > totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad;
    if (benchedSize < totalSizeToLoad)
        DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20));

    /* Memory allocation & restrictions */
    srcBuffer = malloc(benchedSize+NOISELENGTH);   /* + noise */
    if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */

    /* Load input buffer */
    DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles);
    DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH);   /* guard band, for end of buffer condition */

    /* call buffer version */
    dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
                                            srcBuffer, fileSizes, nbFiles,
                                            params);
    if (ZDICT_isError(dictSize)) {
        /* message ends with a newline, like every other message of this file,
         * so that following console output starts on its own line */
        DISPLAYLEVEL(1, "dictionary training failed : %s\n", ZDICT_getErrorName(dictSize));   /* should not happen */
        result = 1;
        goto _cleanup;
    }

    /* save dict */
    DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
    DiB_saveDict(dictFileName, dictBuffer, dictSize);

    /* clean up */
_cleanup:
    free(srcBuffer);
    free(dictBuffer);
    free(fileSizes);
    return result;
}

63
programs/dibio.h Normal file
View File

@ -0,0 +1,63 @@
/*
dibio.h - I/O API for dictionary builder
Copyright (C) Yann Collet 2016
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
You can contact the author at :
- zstd homepage : http://www.zstd.net/
*/
/* This library is designed for a single-threaded console application.
* It calls exit() and prints to stderr when it encounters an error condition. */
#ifndef DIBIO_H_003
#define DIBIO_H_003
/*-*************************************
* Dependencies
***************************************/
#include "dictBuilder_static.h" /* ZDICT_params_t */
/*-*************************************
* Public functions
***************************************/
/*! DiB_trainFromFiles() :
    Train a dictionary from the `nbFiles` sample files listed in `fileNamesTable`.
    Resulting dictionary is written into file `dictFileName`.
    `maxDictSize` is the capacity, in bytes, of the buffer the dictionary is built into.
    `parameters` is optional and can be provided with values set to 0, meaning "default".
    @return : 0 == ok. Any other : error.
    Note : file access and memory allocation failures do not return an error code :
           they terminate the program with exit(), after printing a message to stderr.
*/
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
const char** fileNamesTable, unsigned nbFiles,
ZDICT_params_t parameters);
/*-*************************************
* Helper functions
***************************************/
/*! DiB_setNotificationLevel() :
    Set amount of notification to be displayed on the console.
    Levels : 0 = no display; 1 = errors; 2 = default; 4 = full information.
    Default initial value : 0 = no console notification.
    The same level is forwarded to ZDICT_setNotificationLevel().
    Note : not thread-safe (writes a file-scope variable)
*/
void DiB_setNotificationLevel(unsigned l);
#endif

View File

@ -22,10 +22,9 @@
- zstd homepage : http://www.zstd.net/ - zstd homepage : http://www.zstd.net/
*/ */
/* /*
Note : this is user program. Note : this is user program, not part of libzstd.
It is not part of zstd compression library. The license of this command line program is GPLv2.
The license of this compression CLI program is GPLv2. The license of libzstd is BSD.
The license of zstd library is BSD.
*/ */
@ -46,7 +45,10 @@
#ifndef ZSTD_NOBENCH #ifndef ZSTD_NOBENCH
# include "bench.h" /* BMK_benchFiles, BMK_SetNbIterations */ # include "bench.h" /* BMK_benchFiles, BMK_SetNbIterations */
#endif #endif
#include "zstd.h" /* ZSTD version numbers */ #include "zstd_static.h" /* ZSTD_maxCLevel, ZSTD version numbers */
#ifndef ZSTD_NODICT
# include "dibio.h" /* DiB_trainFromFiles, DiB_setNotificationLevel */
#endif
/*-************************************ /*-************************************
@ -55,9 +57,6 @@
#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__) #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__)
# include <fcntl.h> /* _O_BINARY */ # include <fcntl.h> /* _O_BINARY */
# include <io.h> /* _setmode, _isatty */ # include <io.h> /* _setmode, _isatty */
# ifdef __MINGW32__
/* int _fileno(FILE *stream); // seems no longer useful // MINGW somehow forgets to include this windows declaration into <stdio.h> */
# endif
# define SET_BINARY_MODE(file) _setmode(_fileno(file), _O_BINARY) # define SET_BINARY_MODE(file) _setmode(_fileno(file), _O_BINARY)
# define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream)) # define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream))
#else #else
@ -78,6 +77,7 @@
#endif #endif
#define AUTHOR "Yann Collet" #define AUTHOR "Yann Collet"
#define WELCOME_MESSAGE "*** %s %i-bits %s, by %s ***\n", COMPRESSOR_NAME, (int)(sizeof(void*)*8), ZSTD_VERSION, AUTHOR #define WELCOME_MESSAGE "*** %s %i-bits %s, by %s ***\n", COMPRESSOR_NAME, (int)(sizeof(void*)*8), ZSTD_VERSION, AUTHOR
#define ZSTD_EXTENSION ".zst" #define ZSTD_EXTENSION ".zst"
#define ZSTD_CAT "zstdcat" #define ZSTD_CAT "zstdcat"
#define ZSTD_UNZSTD "unzstd" #define ZSTD_UNZSTD "unzstd"
@ -86,6 +86,11 @@
#define MB *(1 <<20) #define MB *(1 <<20)
#define GB *(1U<<30) #define GB *(1U<<30)
static const char* g_defaultDictName = "dictionary";
static const unsigned g_defaultMaxDictSize = 110 KB;
static const unsigned g_defaultDictCLevel = 5;
static const unsigned g_defaultSelectivityLevel = 9;
/*-************************************ /*-************************************
* Display Macros * Display Macros
@ -96,35 +101,21 @@ static FILE* displayOut;
static unsigned displayLevel = 2; /* 0 : no display, 1: errors, 2 : + result + interaction + warnings, 3 : + progression, 4 : + information */ static unsigned displayLevel = 2; /* 0 : no display, 1: errors, 2 : + result + interaction + warnings, 3 : + progression, 4 : + information */
/*-************************************
* Exceptions
**************************************/
#define DEBUG 0
#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
#define EXM_THROW(error, ...) \
{ \
DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
DISPLAYLEVEL(1, "Error %i : ", error); \
DISPLAYLEVEL(1, __VA_ARGS__); \
DISPLAYLEVEL(1, "\n"); \
exit(error); \
}
/*-************************************ /*-************************************
* Command Line * Command Line
**************************************/ **************************************/
static int usage(const char* programName) static int usage(const char* programName)
{ {
DISPLAY( "Usage :\n"); DISPLAY( "Usage :\n");
DISPLAY( " %s [arg] [input] [output]\n", programName); DISPLAY( " %s [args] [FILE(s)] [-o file]\n", programName);
DISPLAY( "\n"); DISPLAY( "\n");
DISPLAY( "input : a filename\n"); DISPLAY( "FILE : a filename\n");
DISPLAY( " with no FILE, or when FILE is - , read standard input\n"); DISPLAY( " with no FILE, or when FILE is - , read standard input\n");
DISPLAY( "Arguments :\n"); DISPLAY( "Arguments :\n");
DISPLAY( " -# : # compression level (1-19, default:1) \n"); DISPLAY( " -# : # compression level (1-%u, default:1) \n", ZSTD_maxCLevel());
DISPLAY( " -d : decompression \n"); DISPLAY( " -d : decompression \n");
DISPLAY( " -D file: use `file` as Dictionary \n"); DISPLAY( " -D file: use `file` as Dictionary \n");
DISPLAY( " -o file: result stored into `file` (only possible if 1 input file) \n");
DISPLAY( " -f : overwrite output without prompting \n"); DISPLAY( " -f : overwrite output without prompting \n");
DISPLAY( " -h/-H : display help/long help and exit\n"); DISPLAY( " -h/-H : display help/long help and exit\n");
return 0; return 0;
@ -139,14 +130,20 @@ static int usage_advanced(const char* programName)
DISPLAY( " -V : display Version number and exit\n"); DISPLAY( " -V : display Version number and exit\n");
DISPLAY( " -v : verbose mode\n"); DISPLAY( " -v : verbose mode\n");
DISPLAY( " -q : suppress warnings; specify twice to suppress errors too\n"); DISPLAY( " -q : suppress warnings; specify twice to suppress errors too\n");
DISPLAY( " -m : multiple input filenames mode \n");
DISPLAY( " -c : force write to standard output, even if it is the console\n"); DISPLAY( " -c : force write to standard output, even if it is the console\n");
#ifndef ZSTD_NODICT
DISPLAY( "Dictionary builder :\n");
DISPLAY( "--train : Create a dictionary from a set of files \n");
DISPLAY( " -o file: `file` is dictionary name (default: %s) \n", g_defaultDictName);
DISPLAY( "--maxdict:limit dictionary to specified size (default : %u) \n", g_defaultMaxDictSize);
DISPLAY( " -s# : dictionary selectivity level (default: %u)\n", g_defaultSelectivityLevel);
#endif
#ifndef ZSTD_NOBENCH #ifndef ZSTD_NOBENCH
DISPLAY( "Benchmark arguments :\n"); DISPLAY( "Benchmark arguments :\n");
DISPLAY( " -b# : benchmark file(s), using # compression level (default : 1) \n"); DISPLAY( " -b# : benchmark file(s), using # compression level (default : 1) \n");
DISPLAY( " -B# : cut file into independent blocks of size # (default : no block)\n"); DISPLAY( " -B# : cut file into independent blocks of size # (default: no block)\n");
DISPLAY( " -i# : iteration loops [1-9](default : 3)\n"); DISPLAY( " -i# : iteration loops [1-9](default : 3)\n");
DISPLAY( " -r# : test all compression levels from 1 to # (default : disabled)\n"); DISPLAY( " -r# : test all compression levels from 1 to # (default: disabled)\n");
#endif #endif
return 0; return 0;
} }
@ -176,8 +173,10 @@ int main(int argCount, const char** argv)
forceStdout=0, forceStdout=0,
main_pause=0, main_pause=0,
nextEntryIsDictionary=0, nextEntryIsDictionary=0,
multiple=0, operationResult=0,
operationResult=0; dictBuild=0,
nextArgumentIsOutFileName=0,
nextArgumentIsMaxDict=0;
unsigned cLevel = 1; unsigned cLevel = 1;
const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); /* argCount >= 1 */ const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); /* argCount >= 1 */
unsigned filenameIdx = 0; unsigned filenameIdx = 0;
@ -185,11 +184,13 @@ int main(int argCount, const char** argv)
const char* outFileName = NULL; const char* outFileName = NULL;
const char* dictFileName = NULL; const char* dictFileName = NULL;
char* dynNameSpace = NULL; char* dynNameSpace = NULL;
const char extension[] = ZSTD_EXTENSION;
int rangeBench = 1; int rangeBench = 1;
unsigned maxDictSize = g_defaultMaxDictSize;
unsigned dictCLevel = g_defaultDictCLevel;
unsigned dictSelect = g_defaultSelectivityLevel;
/* init */ /* init */
(void)rangeBench; /* not used when ZSTD_NOBENCH set */ (void)rangeBench; (void)dictCLevel; /* not used when ZSTD_NOBENCH / ZSTD_NODICT set */
if (filenameTable==NULL) { DISPLAY("not enough memory\n"); exit(1); } if (filenameTable==NULL) { DISPLAY("not enough memory\n"); exit(1); }
displayOut = stderr; displayOut = stderr;
/* Pick out program name from path. Don't rely on stdlib because of conflicting behavior */ /* Pick out program name from path. Don't rely on stdlib because of conflicting behavior */
@ -208,32 +209,32 @@ int main(int argCount, const char** argv)
/* long commands (--long-word) */ /* long commands (--long-word) */
if (!strcmp(argument, "--version")) { displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; } if (!strcmp(argument, "--version")) { displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; }
if (!strcmp(argument, "--help")) { displayOut=stdout; return usage_advanced(programName); } if (!strcmp(argument, "--help")) { displayOut=stdout; return usage_advanced(programName); }
if (!strcmp(argument, "--multiple")) { multiple=1; continue; }
if (!strcmp(argument, "--verbose")) { displayLevel=4; continue; } if (!strcmp(argument, "--verbose")) { displayLevel=4; continue; }
if (!strcmp(argument, "--quiet")) { displayLevel--; continue; } if (!strcmp(argument, "--quiet")) { displayLevel--; continue; }
if (!strcmp(argument, "--train")) { dictBuild=1; outFileName=g_defaultDictName; continue; }
if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; continue; }
/* '-' means stdin/stdout */
if (!strcmp(argument, "-")){
if (!filenameIdx) { filenameIdx=1, filenameTable[0]=stdinmark; continue; }
outFileName=stdoutmark; continue;
}
/* Decode commands (note : aggregated commands are allowed) */ /* Decode commands (note : aggregated commands are allowed) */
if (argument[0]=='-') { if (argument[0]=='-') {
/* '-' means stdin/stdout */
if (argument[1]==0) {
if (!filenameIdx) { filenameIdx=1, filenameTable[0]=stdinmark; continue; }
outFileName=stdoutmark; continue;
}
argument++; argument++;
while (argument[0]!=0) while (argument[0]!=0) {
{
/* compression Level */ /* compression Level */
if ((*argument>='0') && (*argument<='9')) if ((*argument>='0') && (*argument<='9')) {
{
cLevel = 0; cLevel = 0;
while ((*argument >= '0') && (*argument <= '9')) while ((*argument >= '0') && (*argument <= '9')) {
{
cLevel *= 10; cLevel *= 10;
cLevel += *argument - '0'; cLevel += *argument - '0';
argument++; argument++;
} }
dictCLevel = cLevel;
continue; continue;
} }
@ -247,9 +248,6 @@ int main(int argCount, const char** argv)
/* Decoding */ /* Decoding */
case 'd': decode=1; argument++; break; case 'd': decode=1; argument++; break;
/* Multiple input files */
case 'm': multiple=1; argument++; break;
/* Force stdout, even if stdout==console */ /* Force stdout, even if stdout==console */
case 'c': forceStdout=1; outFileName=stdoutmark; displayLevel=1; argument++; break; case 'c': forceStdout=1; outFileName=stdoutmark; displayLevel=1; argument++; break;
@ -268,6 +266,9 @@ int main(int argCount, const char** argv)
/* keep source file (default anyway, so useless; for gzip/xz compatibility) */ /* keep source file (default anyway, so useless; for gzip/xz compatibility) */
case 'k': argument++; break; case 'k': argument++; break;
/* dictionary name */
case 'o': nextArgumentIsOutFileName=1; argument++; break;
#ifndef ZSTD_NOBENCH #ifndef ZSTD_NOBENCH
/* Benchmark */ /* Benchmark */
case 'b': bench=1; argument++; break; case 'b': bench=1; argument++; break;
@ -304,6 +305,13 @@ int main(int argCount, const char** argv)
break; break;
#endif /* ZSTD_NOBENCH */ #endif /* ZSTD_NOBENCH */
/* Selection level */
case 's': argument++;
dictSelect = 0;
while ((*argument >= '0') && (*argument <= '9'))
dictSelect *= 10, dictSelect += *argument++ - '0';
break;
/* Pause at the end (hidden option) */ /* Pause at the end (hidden option) */
case 'p': main_pause=1; argument++; break; case 'p': main_pause=1; argument++; break;
@ -314,13 +322,29 @@ int main(int argCount, const char** argv)
continue; continue;
} }
/* dictionary */
if (nextEntryIsDictionary) { if (nextEntryIsDictionary) {
nextEntryIsDictionary = 0; nextEntryIsDictionary = 0;
dictFileName = argument; dictFileName = argument;
continue; continue;
} }
if (nextArgumentIsOutFileName) {
nextArgumentIsOutFileName = 0;
outFileName = argument;
if (!strcmp(outFileName, "-")) outFileName = stdoutmark;
continue;
}
if (nextArgumentIsMaxDict) {
nextArgumentIsMaxDict = 0;
maxDictSize = 0;
while ((*argument>='0') && (*argument<='9'))
maxDictSize = maxDictSize * 10 + (*argument - '0'), argument++;
if (*argument=='k' || *argument=='K')
maxDictSize <<= 10;
continue;
}
/* add filename to list */ /* add filename to list */
filenameTable[filenameIdx++] = argument; filenameTable[filenameIdx++] = argument;
} }
@ -336,65 +360,47 @@ int main(int argCount, const char** argv)
goto _end; goto _end;
} }
/* Check if dictionary builder is selected */
if (dictBuild) {
#ifndef ZSTD_NODICT
ZDICT_params_t dictParams;
dictParams.compressionLevel = dictCLevel;
dictParams.selectivityLevel = dictSelect;
DiB_setNotificationLevel(displayLevel);
DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, dictParams);
#endif
goto _end;
}
/* No input filename ==> use stdin */ /* No input filename ==> use stdin */
if(!filenameIdx) filenameIdx=1, filenameTable[0]=stdinmark; if(!filenameIdx) filenameIdx=1, filenameTable[0]=stdinmark;
/* Check if input defined as console; trigger an error in this case */ /* Check if input/output defined as console; trigger an error in this case */
if (!strcmp(filenameTable[0], stdinmark) && IS_CONSOLE(stdin) ) return badusage(programName); if (!strcmp(filenameTable[0], stdinmark) && IS_CONSOLE(stdin) ) return badusage(programName);
if (outFileName && !strcmp(outFileName, stdoutmark) && IS_CONSOLE(stdout) && !forceStdout) return badusage(programName);
/* No output filename ==> try to select one automatically (when possible) */ /* No warning message in pipe mode (stdin + stdout) or multiple mode */
if (filenameIdx>=2) outFileName = filenameTable[1];
while (!outFileName) { /* while : just to allow break statement */
if (!IS_CONSOLE(stdout)) { outFileName=stdoutmark; break; } /* Default to stdout whenever possible (i.e. not a console) */
if (!decode) { /* compression to file */
size_t l = strlen(filenameTable[0]);
dynNameSpace = (char*)calloc(1,l+5);
if (dynNameSpace==NULL) { DISPLAY("not enough memory\n"); exit(1); }
strcpy(dynNameSpace, filenameTable[0]);
strcpy(dynNameSpace+l, ZSTD_EXTENSION);
outFileName = dynNameSpace;
DISPLAYLEVEL(2, "Compressed filename will be : %s \n", outFileName);
break;
}
/* decompression to file (automatic name will work only if input filename has correct format extension) */
{
size_t filenameSize = strlen(filenameTable[0]);
if (strcmp(filenameTable[0] + (filenameSize-4), extension)) {
DISPLAYLEVEL(1, "unknown suffix - cannot determine destination filename\n");
return badusage(programName);
}
dynNameSpace = (char*)calloc(1,filenameSize+1);
if (dynNameSpace==NULL) { DISPLAY("not enough memory\n"); exit(1); }
outFileName = dynNameSpace;
strcpy(dynNameSpace, filenameTable[0]);
dynNameSpace[filenameSize-4]=0;
DISPLAYLEVEL(2, "Decoding file %s \n", outFileName);
} }
/* Check if output is defined as console; trigger an error in this case */
if (!strcmp(outFileName,stdoutmark) && IS_CONSOLE(stdout) && !forceStdout) return badusage(programName);
/* No warning message in pure pipe mode (stdin + stdout) or multiple mode */
if (!strcmp(filenameTable[0], stdinmark) && !strcmp(outFileName,stdoutmark) && (displayLevel==2)) displayLevel=1; if (!strcmp(filenameTable[0], stdinmark) && !strcmp(outFileName,stdoutmark) && (displayLevel==2)) displayLevel=1;
if (multiple && (displayLevel==2)) displayLevel=1; if ((filenameIdx>1) && (displayLevel==2)) displayLevel=1;
if ((!multiple) && (filenameIdx>2)) { /* user-selected output filename only possible with a single file */
DISPLAY("Too many files on the command line (%u > 2). Do you mean -m ? \n", filenameIdx); if ((outFileName) && (filenameIdx>1)) {
DISPLAY("Too many files (%u) on the command line. \n", filenameIdx);
return filenameIdx; return filenameIdx;
} }
/* IO Stream/File */ /* IO Stream/File */
FIO_setNotificationLevel(displayLevel); FIO_setNotificationLevel(displayLevel);
if (decode) { if (decode) {
if (multiple) if (outFileName)
operationResult = FIO_decompressMultipleFilenames(filenameTable, filenameIdx, ZSTD_EXTENSION, dictFileName);
else
operationResult = FIO_decompressFilename(outFileName, filenameTable[0], dictFileName); operationResult = FIO_decompressFilename(outFileName, filenameTable[0], dictFileName);
else
operationResult = FIO_decompressMultipleFilenames(filenameTable, filenameIdx, ZSTD_EXTENSION, dictFileName);
} else { /* compression */ } else { /* compression */
if (multiple) if (outFileName)
operationResult = FIO_compressMultipleFilenames(filenameTable, filenameIdx, ZSTD_EXTENSION, dictFileName, cLevel);
else
operationResult = FIO_compressFilename(outFileName, filenameTable[0], dictFileName, cLevel); operationResult = FIO_compressFilename(outFileName, filenameTable[0], dictFileName, cLevel);
else
operationResult = FIO_compressMultipleFilenames(filenameTable, filenameIdx, ZSTD_EXTENSION, dictFileName, cLevel);
} }
_end: _end: