Don't shrink window log when streaming with a dictionary

Fixes #2442.

1. When creating a dictionary keep the same behavior as before.
   Assume the source size is 513 bytes when adjusting parameters.
2. When calling ZSTD_getCParams() or ZSTD_adjustCParams() keep
   the same behavior as before.
3. When attaching a dictionary keep the same behavior of ignoring
   the dictionary size. When streaming this will select the
   largest parameters and not adjust them down. But, the CDict
   will use the correctly sized parameters, which seems like the
   right tradeoff.
4. When not attaching a dictionary (either forced not to, or
   using a prefix dictionary) we select parameters based on the
   dictionary size + source size, and assume the source size is
   small, which is the same behavior as before. But, now we don't
   adjust the window log (and hash and chain log) down when the
   source size is unknown.

When the source size is unknown all cdicts should attach, except
when the user disables attaching, or `forceWindow` is used. This
means that when streaming with a CDict we end up in the good case
where we get small CDict parameters, and large source parameters.

TODO: Add a streaming + dictionary regression test case.
This commit is contained in:
Nick Terrell 2021-01-04 13:43:47 -08:00
parent a98a6e2091
commit 9d31c704d5
2 changed files with 32 additions and 16 deletions

View File

@ -1188,15 +1188,30 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1);
assert(ZSTD_checkCParams(cPar)==0);
if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN)
srcSize = minSrcSize;
switch (mode) {
case ZSTD_cpm_noAttachDict:
/* If we don't know the source size, don't make any
* assumptions about it. We will already have selected
* smaller parameters if a dictionary is in use.
*/
break;
case ZSTD_cpm_unknown:
/* Keep the legacy behavior of assuming small source
* sizes when the cparam mode is unkown.
*/
/* fall-through */
case ZSTD_cpm_createCDict:
/* Assume a small source size when creating a dictionary
* with an unkown source size.
*/
if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN)
srcSize = minSrcSize;
break;
case ZSTD_cpm_attachDict:
/* Dictionary has its own dedicated parameters which have
* already been selected. We are selecting parameters
* for only the source.
*/
dictSize = 0;
break;
default:
@ -1213,7 +1228,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
ZSTD_highbit32(tSize-1) + 1;
if (cPar.windowLog > srcLog) cPar.windowLog = srcLog;
}
{ U32 const dictAndWindowLog = ZSTD_dictAndWindowLog(cPar.windowLog, (U64)srcSize, (U64)dictSize);
if (srcSize != ZSTD_CONTENTSIZE_UNKNOWN) {
U32 const dictAndWindowLog = ZSTD_dictAndWindowLog(cPar.windowLog, (U64)srcSize, (U64)dictSize);
U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy);
if (cPar.hashLog > dictAndWindowLog+1) cPar.hashLog = dictAndWindowLog+1;
if (cycleLog > dictAndWindowLog)

View File

@ -216,7 +216,7 @@ github.tar, level 16 with dict, zstdcli,
github.tar, level 19, zstdcli, 32841
github.tar, level 19 with dict, zstdcli, 32899
github.tar, no source size, zstdcli, 38442
github.tar, no source size with dict, zstdcli, 213380
github.tar, no source size with dict, zstdcli, 38004
github.tar, long distance mode, zstdcli, 39680
github.tar, multithreaded, zstdcli, 38445
github.tar, multithreaded long distance mode, zstdcli, 39680
@ -618,7 +618,7 @@ github.tar, level 16 with dict, advanced
github.tar, level 19, advanced streaming, 32837
github.tar, level 19 with dict, advanced streaming, 32895
github.tar, no source size, advanced streaming, 38438
github.tar, no source size with dict, advanced streaming, 214381
github.tar, no source size with dict, advanced streaming, 38000
github.tar, long distance mode, advanced streaming, 39676
github.tar, multithreaded, advanced streaming, 38441
github.tar, multithreaded long distance mode, advanced streaming, 39676
@ -695,7 +695,7 @@ github, level 16 with dict, old stre
github, level 19, old streaming, 134064
github, level 19 with dict, old streaming, 37576
github, no source size, old streaming, 140632
github, no source size with dict, old streaming, 40678
github, no source size with dict, old streaming, 40654
github, uncompressed literals, old streaming, 136335
github, uncompressed literals optimal, old streaming, 134064
github, huffman literals, old streaming, 175568
@ -728,7 +728,7 @@ github.tar, level 16 with dict, old stre
github.tar, level 19, old streaming, 32837
github.tar, level 19 with dict, old streaming, 32895
github.tar, no source size, old streaming, 38438
github.tar, no source size with dict, old streaming, 214384
github.tar, no source size with dict, old streaming, 38000
github.tar, uncompressed literals, old streaming, 38441
github.tar, uncompressed literals optimal, old streaming, 32837
github.tar, huffman literals, old streaming, 42465
@ -813,7 +813,7 @@ github, level 16 with dict, old stre
github, level 19, old streaming advanced, 134064
github, level 19 with dict, old streaming advanced, 37576
github, no source size, old streaming advanced, 140632
github, no source size with dict, old streaming advanced, 40643
github, no source size with dict, old streaming advanced, 40608
github, long distance mode, old streaming advanced, 141104
github, multithreaded, old streaming advanced, 141104
github, multithreaded long distance mode, old streaming advanced, 141104
@ -854,7 +854,7 @@ github.tar, level 16 with dict, old stre
github.tar, level 19, old streaming advanced, 32837
github.tar, level 19 with dict, old streaming advanced, 32876
github.tar, no source size, old streaming advanced, 38438
github.tar, no source size with dict, old streaming advanced, 214384
github.tar, no source size with dict, old streaming advanced, 38015
github.tar, long distance mode, old streaming advanced, 38441
github.tar, multithreaded, old streaming advanced, 38441
github.tar, multithreaded long distance mode, old streaming advanced, 38441
@ -880,7 +880,7 @@ github, level 9 with dict, old stre
github, level 13 with dict, old streaming cdcit, 39743
github, level 16 with dict, old streaming cdcit, 37577
github, level 19 with dict, old streaming cdcit, 37576
github, no source size with dict, old streaming cdcit, 40678
github, no source size with dict, old streaming cdcit, 40654
github.tar, level -5 with dict, old streaming cdcit, 45018
github.tar, level -3 with dict, old streaming cdcit, 41886
github.tar, level -1 with dict, old streaming cdcit, 41636
@ -895,7 +895,7 @@ github.tar, level 9 with dict, old stre
github.tar, level 13 with dict, old streaming cdcit, 36372
github.tar, level 16 with dict, old streaming cdcit, 39353
github.tar, level 19 with dict, old streaming cdcit, 32676
github.tar, no source size with dict, old streaming cdcit, 214384
github.tar, no source size with dict, old streaming cdcit, 38000
github, level -5 with dict, old streaming advanced cdict, 49562
github, level -3 with dict, old streaming advanced cdict, 44956
github, level -1 with dict, old streaming advanced cdict, 42383
@ -910,7 +910,7 @@ github, level 9 with dict, old stre
github, level 13 with dict, old streaming advanced cdict, 39731
github, level 16 with dict, old streaming advanced cdict, 40789
github, level 19 with dict, old streaming advanced cdict, 37576
github, no source size with dict, old streaming advanced cdict, 40643
github, no source size with dict, old streaming advanced cdict, 40608
github.tar, level -5 with dict, old streaming advanced cdict, 44307
github.tar, level -3 with dict, old streaming advanced cdict, 41359
github.tar, level -1 with dict, old streaming advanced cdict, 41322
@ -925,4 +925,4 @@ github.tar, level 9 with dict, old stre
github.tar, level 13 with dict, old streaming advanced cdict, 36035
github.tar, level 16 with dict, old streaming advanced cdict, 38736
github.tar, level 19 with dict, old streaming advanced cdict, 32876
github.tar, no source size with dict, old streaming advanced cdict, 214384
github.tar, no source size with dict, old streaming advanced cdict, 38015

1 Data Config Method Total compressed size
216 github.tar level 19 zstdcli 32841
217 github.tar level 19 with dict zstdcli 32899
218 github.tar no source size zstdcli 38442
219 github.tar no source size with dict zstdcli 213380 38004
220 github.tar long distance mode zstdcli 39680
221 github.tar multithreaded zstdcli 38445
222 github.tar multithreaded long distance mode zstdcli 39680
618 github.tar level 19 advanced streaming 32837
619 github.tar level 19 with dict advanced streaming 32895
620 github.tar no source size advanced streaming 38438
621 github.tar no source size with dict advanced streaming 214381 38000
622 github.tar long distance mode advanced streaming 39676
623 github.tar multithreaded advanced streaming 38441
624 github.tar multithreaded long distance mode advanced streaming 39676
695 github level 19 old streaming 134064
696 github level 19 with dict old streaming 37576
697 github no source size old streaming 140632
698 github no source size with dict old streaming 40678 40654
699 github uncompressed literals old streaming 136335
700 github uncompressed literals optimal old streaming 134064
701 github huffman literals old streaming 175568
728 github.tar level 19 old streaming 32837
729 github.tar level 19 with dict old streaming 32895
730 github.tar no source size old streaming 38438
731 github.tar no source size with dict old streaming 214384 38000
732 github.tar uncompressed literals old streaming 38441
733 github.tar uncompressed literals optimal old streaming 32837
734 github.tar huffman literals old streaming 42465
813 github level 19 old streaming advanced 134064
814 github level 19 with dict old streaming advanced 37576
815 github no source size old streaming advanced 140632
816 github no source size with dict old streaming advanced 40643 40608
817 github long distance mode old streaming advanced 141104
818 github multithreaded old streaming advanced 141104
819 github multithreaded long distance mode old streaming advanced 141104
854 github.tar level 19 old streaming advanced 32837
855 github.tar level 19 with dict old streaming advanced 32876
856 github.tar no source size old streaming advanced 38438
857 github.tar no source size with dict old streaming advanced 214384 38015
858 github.tar long distance mode old streaming advanced 38441
859 github.tar multithreaded old streaming advanced 38441
860 github.tar multithreaded long distance mode old streaming advanced 38441
880 github level 13 with dict old streaming cdcit 39743
881 github level 16 with dict old streaming cdcit 37577
882 github level 19 with dict old streaming cdcit 37576
883 github no source size with dict old streaming cdcit 40678 40654
884 github.tar level -5 with dict old streaming cdcit 45018
885 github.tar level -3 with dict old streaming cdcit 41886
886 github.tar level -1 with dict old streaming cdcit 41636
895 github.tar level 13 with dict old streaming cdcit 36372
896 github.tar level 16 with dict old streaming cdcit 39353
897 github.tar level 19 with dict old streaming cdcit 32676
898 github.tar no source size with dict old streaming cdcit 214384 38000
899 github level -5 with dict old streaming advanced cdict 49562
900 github level -3 with dict old streaming advanced cdict 44956
901 github level -1 with dict old streaming advanced cdict 42383
910 github level 13 with dict old streaming advanced cdict 39731
911 github level 16 with dict old streaming advanced cdict 40789
912 github level 19 with dict old streaming advanced cdict 37576
913 github no source size with dict old streaming advanced cdict 40643 40608
914 github.tar level -5 with dict old streaming advanced cdict 44307
915 github.tar level -3 with dict old streaming advanced cdict 41359
916 github.tar level -1 with dict old streaming advanced cdict 41322
925 github.tar level 13 with dict old streaming advanced cdict 36035
926 github.tar level 16 with dict old streaming advanced cdict 38736
927 github.tar level 19 with dict old streaming advanced cdict 32876
928 github.tar no source size with dict old streaming advanced cdict 214384 38015