From d3364aa39e2243638de9ead10699ccd35e0926eb Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 20 Feb 2018 14:48:09 -0800 Subject: [PATCH 1/3] improve benchmark measurement for small inputs by invoking time() once per batch, instead of once per compression / decompression. Batch is dynamically resized so that each round lasts approximately 1 second. Also : increases time accuracy to nanosecond --- programs/bench.c | 71 ++++++++++++++++++++++++++++-------------------- programs/util.h | 12 +++++++- 2 files changed, 53 insertions(+), 30 deletions(-) diff --git a/programs/bench.c b/programs/bench.c index bf3dcb47..e67c886b 100644 --- a/programs/bench.c +++ b/programs/bench.c @@ -22,7 +22,7 @@ * Compiler Warnings ****************************************/ #ifdef _MSC_VER -# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ #endif @@ -34,6 +34,7 @@ #include /* malloc, free */ #include /* memset */ #include /* fprintf, fopen */ +#include /* assert */ #include "mem.h" #define ZSTD_STATIC_LINKING_ONLY @@ -51,8 +52,9 @@ # define ZSTD_GIT_COMMIT_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_GIT_COMMIT) #endif -#define TIMELOOP_MICROSEC 1*1000000ULL /* 1 second */ -#define ACTIVEPERIOD_MICROSEC 70*1000000ULL /* 70 seconds */ +#define TIMELOOP_MICROSEC (1*1000000ULL) /* 1 second */ +#define TIMELOOP_NANOSEC (1*1000000000ULL) /* 1 second */ +#define ACTIVEPERIOD_MICROSEC (70*TIMELOOP_MICROSEC) /* 70 seconds */ #define COOLPERIOD_SEC 10 #define KB *(1 <<10) @@ -264,7 +266,9 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize, { U64 fastestC = (U64)(-1LL), fastestD = (U64)(-1LL); U64 const crcOrig = g_decodeOnly ? 
0 : XXH64(srcBuffer, srcSize, 0); UTIL_time_t coolTime; - U64 const maxTime = (g_nbSeconds * TIMELOOP_MICROSEC) + 1; + U64 const maxTime = (g_nbSeconds * TIMELOOP_NANOSEC) + 1; + U32 nbDecodeLoops = (U32)((100 MB) / (srcSize+1)) + 1; /* initial conservative speed estimate */ + U32 nbCompressionLoops = (U32)((2 MB) / (srcSize+1)) + 1; /* initial conservative speed estimate */ U64 totalCTime=0, totalDTime=0; U32 cCompleted=g_decodeOnly, dCompleted=0; # define NB_MARKS 4 @@ -283,18 +287,16 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize, } if (!g_decodeOnly) { - UTIL_time_t clockStart; /* Compression */ DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->\r", marks[markNb], displayName, (U32)srcSize); if (!cCompleted) memset(compressedBuffer, 0xE5, maxCompressedSize); /* warm up and erase result buffer */ - UTIL_sleepMilli(1); /* give processor time to other processes */ + UTIL_sleepMilli(5); /* give processor time to other processes */ UTIL_waitForNextTick(); - clockStart = UTIL_getTime(); if (!cCompleted) { /* still some time to do compression tests */ - U64 const clockLoop = g_nbSeconds ? 
TIMELOOP_MICROSEC : 1; U32 nbLoops = 0; + UTIL_time_t const clockStart = UTIL_getTime(); ZSTD_CCtx_setParameter(ctx, ZSTD_p_nbWorkers, g_nbWorkers); ZSTD_CCtx_setParameter(ctx, ZSTD_p_compressionLevel, cLevel); ZSTD_CCtx_setParameter(ctx, ZSTD_p_enableLongDistanceMatching, g_ldmFlag); @@ -314,7 +316,9 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize, ZSTD_CCtx_setParameter(ctx, ZSTD_p_targetLength, comprParams->targetLength); ZSTD_CCtx_setParameter(ctx, ZSTD_p_compressionStrategy, comprParams->strategy); ZSTD_CCtx_loadDictionary(ctx, dictBuffer, dictBufferSize); - do { + + if (!g_nbSeconds) nbCompressionLoops=1; + for (nbLoops=0; nbLoops 0) { + if (loopDuration < fastestC * nbCompressionLoops) + fastestC = loopDuration / nbCompressionLoops; + nbCompressionLoops = (1000000000 / fastestC) + 1; + } else { + assert(nbCompressionLoops < 40000000); /* avoid overflow */ + nbCompressionLoops *= 100; } - nbLoops++; - } while (UTIL_clockSpanMicro(clockStart) < clockLoop); - { U64 const loopDuration = UTIL_clockSpanMicro(clockStart); - if (loopDuration < fastestC*nbLoops) - fastestC = loopDuration / nbLoops; totalCTime += loopDuration; cCompleted = (totalCTime >= maxTime); /* end compression tests */ } } @@ -358,7 +366,7 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize, ratio = (double)srcSize / (double)cSize; markNb = (markNb+1) % NB_MARKS; { int const ratioAccuracy = (ratio < 10.) ? 3 : 2; - double const compressionSpeed = (double)srcSize / fastestC; + double const compressionSpeed = ((double)srcSize / fastestC) * 1000; int const cSpeedAccuracy = (compressionSpeed < 10.) ? 
2 : 1; DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s\r", marks[markNb], displayName, (U32)srcSize, (U32)cSize, @@ -376,16 +384,16 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize, /* Decompression */ if (!dCompleted) memset(resultBuffer, 0xD6, srcSize); /* warm result buffer */ - UTIL_sleepMilli(1); /* give processor time to other processes */ + UTIL_sleepMilli(5); /* give processor time to other processes */ UTIL_waitForNextTick(); if (!dCompleted) { - U64 clockLoop = g_nbSeconds ? TIMELOOP_MICROSEC : 1; U32 nbLoops = 0; ZSTD_DDict* const ddict = ZSTD_createDDict(dictBuffer, dictBufferSize); UTIL_time_t const clockStart = UTIL_getTime(); if (!ddict) EXM_THROW(2, "ZSTD_createDDict() allocation failure"); - do { + if (!g_nbSeconds) nbDecodeLoops = 1; + for (nbLoops=0; nbLoops < nbDecodeLoops; nbLoops++) { U32 blockNb; for (blockNb=0; blockNb 0) { + if (loopDuration < fastestD * nbDecodeLoops) + fastestD = loopDuration / nbDecodeLoops; + nbDecodeLoops = (1000000000/*1sec*/ / fastestD) + 1; + } else { + assert(nbDecodeLoops < 40000000); /* avoid overflow */ + nbDecodeLoops *= 100; + } totalDTime += loopDuration; dCompleted = (totalDTime >= maxTime); } } markNb = (markNb+1) % NB_MARKS; { int const ratioAccuracy = (ratio < 10.) ? 3 : 2; - double const compressionSpeed = (double)srcSize / fastestC; + double const compressionSpeed = ((double)srcSize / fastestC) * 1000; int const cSpeedAccuracy = (compressionSpeed < 10.) ? 
2 : 1; - double const decompressionSpeed = (double)srcSize / fastestD; + double const decompressionSpeed = ((double)srcSize / fastestD) * 1000; DISPLAYLEVEL(2, "%2s-%-17.17s :%10u ->%10u (%5.*f),%6.*f MB/s ,%6.1f MB/s \r", marks[markNb], displayName, (U32)srcSize, (U32)cSize, ratioAccuracy, ratio, @@ -461,8 +473,8 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize, } /* for (testNb = 1; testNb <= (g_nbSeconds + !g_nbSeconds); testNb++) */ if (g_displayLevel == 1) { /* hidden display mode -q, used by python speed benchmark */ - double cSpeed = (double)srcSize / fastestC; - double dSpeed = (double)srcSize / fastestD; + double cSpeed = ((double)srcSize / fastestC) * 1000; + double dSpeed = ((double)srcSize / fastestD) * 1000; if (g_additionalParam) DISPLAY("-%-3i%11i (%5.3f) %6.2f MB/s %6.1f MB/s %s (param=%d)\n", cLevel, (int)cSize, ratio, cSpeed, dSpeed, displayName, g_additionalParam); else @@ -634,7 +646,8 @@ static void BMK_benchFileTable(const char* const * const fileNamesTable, unsigne } -static void BMK_syntheticTest(int cLevel, int cLevelLast, double compressibility, const ZSTD_compressionParameters* compressionParams) +static void BMK_syntheticTest(int cLevel, int cLevelLast, double compressibility, + const ZSTD_compressionParameters* compressionParams) { char name[20] = {0}; size_t benchedSize = 10000000; diff --git a/programs/util.h b/programs/util.h index 4f22236a..3e697457 100644 --- a/programs/util.h +++ b/programs/util.h @@ -142,7 +142,9 @@ static int g_utilDisplayLevel; } return 1000000000ULL*(clockEnd.QuadPart - clockStart.QuadPart)/ticksPerSecond.QuadPart; } + #elif defined(__APPLE__) && defined(__MACH__) + #include #define UTIL_TIME_INITIALIZER 0 typedef U64 UTIL_time_t; @@ -167,7 +169,9 @@ static int g_utilDisplayLevel; } return ((clockEnd - clockStart) * (U64)rate.numer) / ((U64)rate.denom); } + #elif (PLATFORM_POSIX_VERSION >= 200112L) && (defined __UCLIBC__ || ((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 17) || __GLIBC__ > 2)) + #define 
UTIL_TIME_INITIALIZER { 0, 0 } typedef struct timespec UTIL_freq_t; typedef struct timespec UTIL_time_t; @@ -217,12 +221,18 @@ static int g_utilDisplayLevel; #define SEC_TO_MICRO 1000000 /* returns time span in microseconds */ -UTIL_STATIC U64 UTIL_clockSpanMicro( UTIL_time_t clockStart ) +UTIL_STATIC U64 UTIL_clockSpanMicro(UTIL_time_t clockStart ) { UTIL_time_t const clockEnd = UTIL_getTime(); return UTIL_getSpanTimeMicro(clockStart, clockEnd); } +/* returns time span in nanoseconds */ +UTIL_STATIC U64 UTIL_clockSpanNano(UTIL_time_t clockStart ) +{ + UTIL_time_t const clockEnd = UTIL_getTime(); + return UTIL_getSpanTimeNano(clockStart, clockEnd); +} UTIL_STATIC void UTIL_waitForNextTick(void) { From 3538a535bfe1532fb33bfa85bb2b70868506d682 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 20 Feb 2018 15:33:56 -0800 Subject: [PATCH 2/3] use TIMELOOP_NANOSEC as suggested by @terrelln --- programs/bench.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/programs/bench.c b/programs/bench.c index e67c886b..5997cfbc 100644 --- a/programs/bench.c +++ b/programs/bench.c @@ -352,7 +352,7 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize, if (loopDuration > 0) { if (loopDuration < fastestC * nbCompressionLoops) fastestC = loopDuration / nbCompressionLoops; - nbCompressionLoops = (1000000000 / fastestC) + 1; + nbCompressionLoops = (TIMELOOP_NANOSEC / fastestC) + 1; } else { assert(nbCompressionLoops < 40000000); /* avoid overflow */ nbCompressionLoops *= 100; @@ -411,7 +411,7 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize, if (loopDuration > 0) { if (loopDuration < fastestD * nbDecodeLoops) fastestD = loopDuration / nbDecodeLoops; - nbDecodeLoops = (1000000000/*1sec*/ / fastestD) + 1; + nbDecodeLoops = (TIMELOOP_NANOSEC / fastestD) + 1; } else { assert(nbDecodeLoops < 40000000); /* avoid overflow */ nbDecodeLoops *= 100; From 25d00d10fc479c516206373571f75cc213f5415a Mon Sep 17 00:00:00 2001 From: Yann Collet Date: 
Tue, 20 Feb 2018 16:52:28 -0800 Subject: [PATCH 3/3] fixed minor conversion warning --- programs/bench.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/programs/bench.c b/programs/bench.c index 5997cfbc..b4cd619c 100644 --- a/programs/bench.c +++ b/programs/bench.c @@ -352,7 +352,7 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize, if (loopDuration > 0) { if (loopDuration < fastestC * nbCompressionLoops) fastestC = loopDuration / nbCompressionLoops; - nbCompressionLoops = (TIMELOOP_NANOSEC / fastestC) + 1; + nbCompressionLoops = (U32)(TIMELOOP_NANOSEC / fastestC) + 1; } else { assert(nbCompressionLoops < 40000000); /* avoid overflow */ nbCompressionLoops *= 100; @@ -411,7 +411,7 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize, if (loopDuration > 0) { if (loopDuration < fastestD * nbDecodeLoops) fastestD = loopDuration / nbDecodeLoops; - nbDecodeLoops = (TIMELOOP_NANOSEC / fastestD) + 1; + nbDecodeLoops = (U32)(TIMELOOP_NANOSEC / fastestD) + 1; } else { assert(nbDecodeLoops < 40000000); /* avoid overflow */ nbDecodeLoops *= 100;