Merge branch 'bench' into largeNbDicts

2018-08-28 11:26:46 -07:00 · 2018-08-28 11:26:46 -07:00 · 0491037db9
parent 274b60e6e6 55affc09de
commit 0491037db9
6 changed files with 99 additions and 73 deletions
--- a/programs/bench.c
+++ b/programs/bench.c
@ -63,6 +63,8 @@
 #define MB *(1 <<20)
 #define GB *(1U<<30)

+#define BMK_RUNTEST_DEFAULT_MS 1000
+
 static const size_t maxMemory = (sizeof(size_t)==4)  ?
                    /* 32-bit */ (2 GB - 64 MB) :
                    /* 64-bit */ (size_t)(1ULL << ((sizeof(size_t)*8)-31));
@ -375,32 +377,37 @@ BMK_runOutcome_t BMK_benchFunction(
 struct BMK_timedFnState_s {
    U64 timeSpent_ns;
    U64 timeBudget_ns;
+    U64 runBudget_ns;
    BMK_runTime_t fastestRun;
    unsigned nbLoops;
    UTIL_time_t coolTime;
 };  /* typedef'd to BMK_timedFnState_t within bench.h */

-BMK_timedFnState_t* BMK_createTimedFnState(unsigned nbSeconds) {
+BMK_timedFnState_t* BMK_createTimedFnState(unsigned total_ms, unsigned run_ms)
+{
    BMK_timedFnState_t* const r = (BMK_timedFnState_t*)malloc(sizeof(*r));
    if (r == NULL) return NULL;   /* malloc() error */
-    BMK_resetTimedFnState(r, nbSeconds);
+    BMK_resetTimedFnState(r, total_ms, run_ms);
    return r;
 }

-void BMK_resetTimedFnState(BMK_timedFnState_t* r, unsigned nbSeconds) {
-    r->timeSpent_ns = 0;
-    r->timeBudget_ns = (U64)nbSeconds * TIMELOOP_NANOSEC;
-    if (!nbSeconds) r->timeBudget_ns = 1;
-    r->fastestRun.nanoSecPerRun = (U64)(-1LL);
-    r->fastestRun.sumOfReturn = (size_t)(-1LL);
-    r->nbLoops = 1;
-    r->coolTime = UTIL_getTime();
-}
-
 void BMK_freeTimedFnState(BMK_timedFnState_t* state) {
    free(state);
 }

+void BMK_resetTimedFnState(BMK_timedFnState_t* timedFnState, unsigned total_ms, unsigned run_ms)
+{
+    if (!total_ms) total_ms = 1 ;
+    if (!run_ms) run_ms = 1;
+    if (run_ms > total_ms) run_ms = total_ms;
+    timedFnState->timeSpent_ns = 0;
+    timedFnState->timeBudget_ns = (U64)total_ms * TIMELOOP_NANOSEC / 1000;
+    timedFnState->runBudget_ns = (U64)run_ms * TIMELOOP_NANOSEC / 1000;
+    timedFnState->fastestRun.nanoSecPerRun = (U64)(-1LL);
+    timedFnState->fastestRun.sumOfReturn = (size_t)(-1LL);
+    timedFnState->nbLoops = 1;
+    timedFnState->coolTime = UTIL_getTime();
+}

 /* Tells if the total time budget set in timedFnState for all runs is spent.
 * note : this function will return 1 if BMK_benchTimedFn() has actually errored. */
@ -421,6 +428,8 @@ BMK_runOutcome_t BMK_benchTimedFn(
            void * const * dstBlockBuffers, const size_t * dstBlockCapacities,
            size_t* blockResults)
 {
+    U64 const runBudget_ns = cont->runBudget_ns;
+    U64 const runTimeMin_ns = runBudget_ns / 2;
    int completed = 0;
    BMK_runTime_t bestRunTime = cont->fastestRun;

@ -453,9 +462,9 @@ BMK_runOutcome_t BMK_benchTimedFn(
            cont->timeSpent_ns += loopDuration_ns;

            /* estimate nbLoops for next run to last approximately runBudget_ns */
-            if (loopDuration_ns > (TIMELOOP_NANOSEC / 50)) {
+            if (loopDuration_ns > (runBudget_ns / 50)) {
                U64 const fastestRun_ns = MIN(bestRunTime.nanoSecPerRun, newRunTime.nanoSecPerRun);
-                cont->nbLoops = (U32)(TIMELOOP_NANOSEC / fastestRun_ns) + 1;
+                cont->nbLoops = (U32)(runBudget_ns / fastestRun_ns) + 1;
            } else {
                /* previous run was too short : blindly increase workload by x multiplier */
                const unsigned multiplier = 10;
@ -463,7 +472,7 @@ BMK_runOutcome_t BMK_benchTimedFn(
                cont->nbLoops *= multiplier;
            }

-            if(loopDuration_ns < MINUSABLETIME) {
+            if(loopDuration_ns < runTimeMin_ns) {
                /* don't report results for which benchmark run time was too small : increased risks of rounding errors */
                assert(completed == 0);
                continue;
@ -775,8 +784,8 @@ BMK_benchOutcome_t BMK_benchMemAdvanced(const void* srcBuffer, size_t srcSize,
    void ** const resPtrs = (void**)malloc(maxNbBlocks * sizeof(void*));
    size_t* const resSizes = (size_t*)malloc(maxNbBlocks * sizeof(size_t));

-    BMK_timedFnState_t* timeStateCompress = BMK_createTimedFnState(adv->nbSeconds);
-    BMK_timedFnState_t* timeStateDecompress = BMK_createTimedFnState(adv->nbSeconds);
+    BMK_timedFnState_t* timeStateCompress = BMK_createTimedFnState(adv->nbSeconds * 1000, BMK_RUNTEST_DEFAULT_MS);
+    BMK_timedFnState_t* timeStateDecompress = BMK_createTimedFnState(adv->nbSeconds * 1000, BMK_RUNTEST_DEFAULT_MS);

    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
--- a/programs/bench.h
+++ b/programs/bench.h
@ -255,24 +255,33 @@ BMK_runOutcome_t BMK_benchFunction(



-/* ====  Benchmarking any function, providing intermediate results  ==== */
+/* ====  Benchmark any function, providing intermediate results  ==== */

-/* state information needed by benchFunctionTimed */
+/* state information tracking benchmark session */
 typedef struct BMK_timedFnState_s BMK_timedFnState_t;

-BMK_timedFnState_t* BMK_createTimedFnState(unsigned nbSeconds);
-void BMK_resetTimedFnState(BMK_timedFnState_t* timedFnState, unsigned nbSeconds);
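+/* BMK_createTimedFnState() and BMK_resetTimedFnState() :
+ * Create/Set BMK_timedFnState_t for the next benchmark session,
+ * which shall last a minimum of total_ms milliseconds,
+ * producing intermediate results, paced at intervals of (approximately) run_ms milliseconds.
+ * Note : a value of 0 for total_ms or run_ms is replaced by 1,
+ *        and run_ms is capped to total_ms.
+ * BMK_createTimedFnState() returns NULL if allocation fails.
+ */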
+BMK_timedFnState_t* BMK_createTimedFnState(unsigned total_ms, unsigned run_ms);
+void BMK_resetTimedFnState(BMK_timedFnState_t* timedFnState, unsigned total_ms, unsigned run_ms);
 void BMK_freeTimedFnState(BMK_timedFnState_t* state);


+/* Tells if duration of all benchmark runs has exceeded total_ms
+ */
+int BMK_isCompleted_TimedFn(const BMK_timedFnState_t* timedFnState);
+
+
 /* BMK_benchTimedFn() :
- * Similar to BMK_benchFunction(),
- * tries to find automatically `nbLoops`, so that each run lasts approximately 1 second.
- * Note : minimum `nbLoops` is 1, a run may last more than 1 second if benchFn is slow.
- * Most arguments are the same as BMK_benchFunction()
- * Usage - initialize a timedFnState, selecting a total nbSeconds allocated for _all_ benchmarks run
- *         call BMK_benchTimedFn() repetitively, collecting intermediate results (each run is supposed to last about 1 seconds)
- *         Check if time budget is spent using BMK_isCompleted_TimedFn()
+ * Similar to BMK_benchFunction(), most arguments being identical.
+ * Automatically determines `nbLoops` so that results are produced regularly, at intervals of about run_ms.
+ * Note : minimum `nbLoops` is 1, therefore a run may last more than run_ms, and possibly even more than total_ms.
+ * Usage - initialize timedFnState, selecting benchmark duration (total_ms) and each measurement duration (run_ms);
+ *         call BMK_benchTimedFn() repeatedly, each measurement being expected to last about run_ms;
+ *         check if total time budget is spent or exceeded, using BMK_isCompleted_TimedFn().
 */
 BMK_runOutcome_t BMK_benchTimedFn(
                    BMK_timedFnState_t* timedFnState,
@ -284,9 +293,6 @@ BMK_runOutcome_t BMK_benchTimedFn(
                    size_t* blockResults);


-/* Tells if total nb of benchmark runs has exceeded amount of time set in timedFnState
- */
-int BMK_isCompleted_TimedFn(const BMK_timedFnState_t* timedFnState);



--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@ -846,13 +846,13 @@ int main(int argCount, const char* argv[])
        if (cLevelLast > ZSTD_maxCLevel()) cLevelLast = ZSTD_maxCLevel();
        if (cLevelLast < cLevel) cLevelLast = cLevel;
        if (cLevelLast > cLevel)
-            DISPLAYLEVEL(2, "Benchmarking levels from %d to %d\n", cLevel, cLevelLast);
+            DISPLAYLEVEL(3, "Benchmarking levels from %d to %d\n", cLevel, cLevelLast);
        if(filenameIdx) {
            if(separateFiles) {
                unsigned i;
                for(i = 0; i < filenameIdx; i++) {
                    int c;
-                    DISPLAYLEVEL(2, "Benchmarking %s \n", filenameTable[i]);
+                    DISPLAYLEVEL(3, "Benchmarking %s \n", filenameTable[i]);
                    for(c = cLevel; c <= cLevelLast; c++) {
                        BMK_benchFilesAdvanced(&filenameTable[i], 1, dictFileName, c, &compressionParams, g_displayLevel, &benchParams);
                    }
--- a/tests/Makefile
+++ b/tests/Makefile
@ -200,7 +200,7 @@ zstreamtest-dll : $(ZSTDDIR)/common/xxhash.c  # xxh symbols not exposed from dll
 zstreamtest-dll : $(ZSTREAM_LOCAL_FILES)
 	$(CC) $(CPPFLAGS) $(CFLAGS) $(filter %.c,$^) $(LDFLAGS) -o $@$(EXT)

-paramgrill : DEBUGFLAGS = -DNDEBUG  # turn off assert() for speed measurements
+paramgrill : DEBUGFLAGS =  # turn off assert() by default for speed measurements
 paramgrill : $(ZSTD_FILES) $(PRGDIR)/bench.c $(PRGDIR)/datagen.c paramgrill.c
 	$(CC) $(FLAGS) $^ -lm -o $@$(EXT)

--- a/tests/fullbench.c
+++ b/tests/fullbench.c
@ -514,10 +514,11 @@ static size_t benchMem(U32 benchNb,
    { size_t i; for (i=0; i<dstBuffSize; i++) dstBuff[i]=(BYTE)i; }

    /* benchmark loop */
-    {   BMK_timedFnState_t* const tfs = BMK_createTimedFnState(g_nbIterations);
+    {   BMK_timedFnState_t* const tfs = BMK_createTimedFnState(g_nbIterations * 1000, 1000);
        BMK_runTime_t bestResult;
        bestResult.sumOfReturn = 0;
        bestResult.nanoSecPerRun = (unsigned long long)(-1LL);
+        assert(tfs != NULL);
        for (;;) {
            void* const dstBuffv = dstBuff;
            BMK_runOutcome_t const bOutcome =
--- a/tests/paramgrill.c
+++ b/tests/paramgrill.c
@ -468,7 +468,7 @@ static void paramVariation(paramValues_t* ptr, memoTable_t* mtAll, const U32 nbC
 static paramValues_t randomParams(void)
 {
    varInds_t v; paramValues_t p;
-    for(v = 0; v <= NUM_PARAMS; v++) {
+    for(v = 0; v < NUM_PARAMS; v++) {
        p.vals[v] = rangeMap(v, FUZ_rand(&g_rand) % rangetable[v]);
    }
    return p;
@ -632,32 +632,39 @@ static void BMK_translateAdvancedParams(FILE* f, const paramValues_t params) {
    varInds_t v;
    int first = 1;
    fprintf(f,"--zstd=");
-    for(v = 0; v < NUM_PARAMS; v++) {
-        if(g_silenceParams[v]) { continue; }
-        if(!first) { fprintf(f, ","); }
+    for (v = 0; v < NUM_PARAMS; v++) {
+        if (g_silenceParams[v]) { continue; }
+        if (!first) { fprintf(f, ","); }
        fprintf(f,"%s=", g_paramNames[v]);

-        if(v == strt_ind) { fprintf(f,"%u", params.vals[v]); }
+        if (v == strt_ind) { fprintf(f,"%u", params.vals[v]); }
        else { displayParamVal(f, v, params.vals[v], 0); }
        first = 0;
    }
    fprintf(f, "\n");
 }

-static void BMK_displayOneResult(FILE* f, winnerInfo_t res, const size_t srcSize) {
-            varInds_t v;
-            int first = 1;
-            res.params = cParamUnsetMin(res.params);
-            fprintf(f,"    {");
-            for(v = 0; v < NUM_PARAMS; v++) {
-                if(g_silenceParams[v]) { continue; }
-                if(!first) { fprintf(f, ","); }
-                displayParamVal(f, v, res.params.vals[v], 3);
-                first = 0;
-            }
+static void BMK_displayOneResult(FILE* f, winnerInfo_t res, const size_t srcSize)
+{
+    varInds_t v;
+    int first = 1;
+    res.params = cParamUnsetMin(res.params);
+    fprintf(f, "    {");
+    for (v = 0; v < NUM_PARAMS; v++) {
+        if (g_silenceParams[v]) { continue; }
+        if (!first) { fprintf(f, ","); }
+        displayParamVal(f, v, res.params.vals[v], 3);
+        first = 0;
+    }

-            fprintf(f, " },     /* R:%5.3f at %5.1f MB/s - %5.1f MB/s */\n",
-            (double)srcSize / res.result.cSize, (double)res.result.cSpeed / MB_UNIT, (double)res.result.dSpeed / MB_UNIT);
+    {   double const ratio = res.result.cSize ?
+                            (double)srcSize / res.result.cSize : 0;
+        double const cSpeedMBps = (double)res.result.cSpeed / MB_UNIT;
+        double const dSpeedMBps = (double)res.result.dSpeed / MB_UNIT;
+
+        fprintf(f, " },     /* R:%5.3f at %5.1f MB/s - %5.1f MB/s */\n",
+                            ratio, cSpeedMBps, dSpeedMBps);
+    }
 }

 /* Writes to f the results of a parameter benchmark */
@ -1427,8 +1434,8 @@ BMK_benchMemInvertible( buffers_t buf, contexts_t ctx,
        /* init args */
        int compressionCompleted = (mode == BMK_decodeOnly);
        int decompressionCompleted = (mode == BMK_compressOnly);
-        BMK_timedFnState_t* timeStateCompress = BMK_createTimedFnState(nbSeconds);
-        BMK_timedFnState_t* timeStateDecompress = BMK_createTimedFnState(nbSeconds);
+        BMK_timedFnState_t* timeStateCompress = BMK_createTimedFnState(nbSeconds * 1000, 1000);
+        BMK_timedFnState_t* timeStateDecompress = BMK_createTimedFnState(nbSeconds * 1000, 1000);
        BMK_initCCtxArgs cctxprep;
        BMK_initDCtxArgs dctxprep;
        cctxprep.cctx = cctx;
@ -1440,6 +1447,8 @@ BMK_benchMemInvertible( buffers_t buf, contexts_t ctx,
        dctxprep.dictBuffer = dictBuffer;
        dctxprep.dictBufferSize = dictBufferSize;

+        assert(timeStateCompress != NULL);
+        assert(timeStateDecompress != NULL);
        while(!compressionCompleted) {
            BMK_runOutcome_t const cOutcome = BMK_benchTimedFn(timeStateCompress,
                                            &local_defaultCompress, cctx,
@ -1540,12 +1549,13 @@ static int allBench(BMK_benchResult_t* resultPtr,
                const constraint_t target,
                BMK_benchResult_t* winnerResult, int feas)
 {
-    BMK_benchResult_t resultMax, benchres;
+    BMK_benchResult_t benchres;
    U64 loopDurationC = 0, loopDurationD = 0;
    double uncertaintyConstantC = 3., uncertaintyConstantD = 3.;
    double winnerRS;
+
    /* initial benchmarking, gives exact ratio and memory, warms up future runs */
-    CBENCHMARK(1, benchres, tmp, BMK_both, 1);
+    CBENCHMARK(1, benchres, tmp, BMK_both, 2);

    winnerRS = resultScore(*winnerResult, buf.srcSize, target);
    DEBUGOUTPUT("WinnerScore: %f\n ", winnerRS);
@ -1554,12 +1564,12 @@ static int allBench(BMK_benchResult_t* resultPtr,

    /* calculate uncertainty in compression / decompression runs */
    if(benchres.cSpeed) {
-        loopDurationC = ((buf.srcSize * TIMELOOP_NANOSEC) / benchres.cSpeed);
+        loopDurationC = (((U64)buf.srcSize * TIMELOOP_NANOSEC) / benchres.cSpeed);
        uncertaintyConstantC = ((loopDurationC + (double)(2 * g_clockGranularity))/loopDurationC);
    }

    if(benchres.dSpeed) {
-        loopDurationD = ((buf.srcSize * TIMELOOP_NANOSEC) / benchres.dSpeed);
+        loopDurationD = (((U64)buf.srcSize * TIMELOOP_NANOSEC) / benchres.dSpeed);
        uncertaintyConstantD = ((loopDurationD + (double)(2 * g_clockGranularity))/loopDurationD);
    }

@ -1568,27 +1578,25 @@ static int allBench(BMK_benchResult_t* resultPtr,
        return WORSE_RESULT;
    }

-    /* second run, if first run is too short, gives approximate cSpeed + dSpeed */
-    CBENCHMARK(loopDurationC < TIMELOOP_NANOSEC / 10, benchres, tmp, BMK_compressOnly, 1);
-    CBENCHMARK(loopDurationD < TIMELOOP_NANOSEC / 10, benchres, tmp, BMK_decodeOnly,   1);
+    /* ensure all measurements last a minimum time, to reduce measurement errors */
+    assert(loopDurationC >= TIMELOOP_NANOSEC / 10);
+    assert(loopDurationD >= TIMELOOP_NANOSEC / 10);

    *resultPtr = benchres;

    /* optimistic assumption of benchres */
-    resultMax = benchres;
-    resultMax.cSpeed *= uncertaintyConstantC * VARIANCE;
-    resultMax.dSpeed *= uncertaintyConstantD * VARIANCE;
+    {   BMK_benchResult_t resultMax = benchres;
+        resultMax.cSpeed *= uncertaintyConstantC * VARIANCE;
+        resultMax.dSpeed *= uncertaintyConstantD * VARIANCE;

-    /* disregard infeasible results in feas mode */
-    /* disregard if resultMax < winner in infeas mode */
-    if((feas && !feasible(resultMax, target)) ||
-      (!feas && (winnerRS > resultScore(resultMax, buf.srcSize, target)))) {
-        return WORSE_RESULT;
+        /* disregard infeasible results in feas mode */
+        /* disregard if resultMax < winner in infeas mode */
+        if((feas && !feasible(resultMax, target)) ||
+          (!feas && (winnerRS > resultScore(resultMax, buf.srcSize, target)))) {
+            return WORSE_RESULT;
+        }
    }

-    CBENCHMARK(loopDurationC < TIMELOOP_NANOSEC, benchres, tmp, BMK_compressOnly, 1);
-    CBENCHMARK(loopDurationD < TIMELOOP_NANOSEC, benchres, tmp, BMK_decodeOnly,   1);
-
    *resultPtr = benchres;

    /* compare by resultScore when in infeas */
@ -1601,6 +1609,7 @@ static int allBench(BMK_benchResult_t* resultPtr,
    }
 }

+
 #define INFEASIBLE_THRESHOLD 200
 /* Memoized benchmarking, won't benchmark anything which has already been benchmarked before. */
 static int benchMemo(BMK_benchResult_t* resultPtr,
@ -1628,6 +1637,7 @@ static int benchMemo(BMK_benchResult_t* resultPtr,
    return res;
 }

+
 typedef struct {
    U64 cSpeed_min;
    U64 dSpeed_min;