From f332ece468f2368001c6e593c8f2dade56cadd8e Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Thu, 23 Mar 2017 16:24:02 -0700 Subject: [PATCH] dictBuilder fails to create dictionary on certain input Properly expressed with an error code (see zstd_errors.h) and a cli return code != 0 --- lib/common/error_private.c | 1 + lib/common/zstd_errors.h | 1 + lib/dictBuilder/zdict.c | 8 +++++--- programs/zstdcli.c | 4 ++-- tests/playTests.sh | 5 +++++ 5 files changed, 14 insertions(+), 5 deletions(-) diff --git a/lib/common/error_private.c b/lib/common/error_private.c index a0fa1724..44ae2010 100644 --- a/lib/common/error_private.c +++ b/lib/common/error_private.c @@ -37,6 +37,7 @@ const char* ERR_getErrorString(ERR_enum code) case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; case PREFIX(dictionary_wrong): return "Dictionary mismatch"; + case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; case PREFIX(maxCode): default: return notErrorCode; } diff --git a/lib/common/zstd_errors.h b/lib/common/zstd_errors.h index 949dbd0f..3d579d96 100644 --- a/lib/common/zstd_errors.h +++ b/lib/common/zstd_errors.h @@ -57,6 +57,7 @@ typedef enum { ZSTD_error_maxSymbolValue_tooSmall, ZSTD_error_dictionary_corrupted, ZSTD_error_dictionary_wrong, + ZSTD_error_dictionaryCreation_failed, ZSTD_error_maxCode } ZSTD_ErrorCode; diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c index c8419579..842167db 100644 --- a/lib/dictBuilder/zdict.c +++ b/lib/dictBuilder/zdict.c @@ -62,8 +62,9 @@ #define MINRATIO 4 static const int g_compressionLevel_default = 6; static const U32 g_selectivity_default = 9; -static const size_t g_provision_entropySize = 200; +static const size_t g_provision_entropySize = 192; static const size_t g_min_fast_dictContent = 192; +static const size_t g_dictContentSize_min = 32; /*-************************************* @@ -929,8 +930,8 @@ size_t ZDICT_trainFromBuffer_unsafe( /* checks */ if (!dictList) return ERROR(memory_allocation); - if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); } - if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return 0; } /* not enough source to create dictionary */ + if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); } /* requested dictionary size is too small */ + if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } /* not enough source to create dictionary */ /* init */ ZDICT_initDictItem(dictList); @@ -963,6 +964,7 @@ size_t ZDICT_trainFromBuffer_unsafe( /* create dictionary */ { U32 dictContentSize = ZDICT_dictSize(dictList); + if (dictContentSize < g_dictContentSize_min) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */ if (dictContentSize < targetDictSize/3) { DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize); if (minRep > MINRATIO) { diff --git a/programs/zstdcli.c b/programs/zstdcli.c index 4d7fbb35..281301bd 100644 --- a/programs/zstdcli.c +++ b/programs/zstdcli.c @@ -629,7 +629,7 @@ int main(int argCount, const char* argv[]) coverParams.compressionLevel = dictCLevel; coverParams.notificationLevel = g_displayLevel; coverParams.dictID = dictID; - DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, cover - 1); + operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, cover - 1); } else { ZDICT_params_t dictParams; memset(&dictParams, 0, sizeof(dictParams)); @@ -637,7 +637,7 @@ int main(int argCount, const char* argv[]) dictParams.selectivityLevel = dictSelect; dictParams.notificationLevel = g_displayLevel; dictParams.dictID = dictID; - DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0); + operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0); } #endif goto _end; diff --git a/tests/playTests.sh b/tests/playTests.sh index e98e0f44..897a9015 100755 --- a/tests/playTests.sh +++ b/tests/playTests.sh @@ -281,6 +281,11 @@ case "$UNAME" in *) $MD5SUM -c tmph1 ;; esac rm -rf dirTestDict +$ECHO "- dictionary builder on bogus input" +$ECHO "Hello World" > tmp +$ZSTD --train -q tmp && die "Dictionary training should fail : not enough input source" +./datagen -P0 -g10M > tmp +$ZSTD --train -q tmp && die "Dictionary training should fail : source is pure noise" rm tmp*