Merge pull request #2750 from senhuang42/sb_compress

Improve branch misses on FSE symbol spreading
This commit is contained in:
sen 2021-08-20 12:47:24 -04:00 committed by GitHub
commit ae998544de
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 48 additions and 7 deletions

View File

@ -337,7 +337,7 @@ size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue);
* Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`). * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
* `wkspSize` must be >= `FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog)` of `unsigned`. * `wkspSize` must be >= `FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog)` of `unsigned`.
*/ */
#define FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog) (maxSymbolValue + 2 + (1ull << (tableLog - 2))) #define FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog) (((maxSymbolValue + 2) + (1ull << (tableLog)))/2)
#define FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) (sizeof(unsigned) * FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog)) #define FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) (sizeof(unsigned) * FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog))
size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);

View File

@ -75,9 +75,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize>>1 : 1) ; void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize>>1 : 1) ;
FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
U32 const step = FSE_TABLESTEP(tableSize); U32 const step = FSE_TABLESTEP(tableSize);
U32 const maxSV1 = maxSymbolValue+1;
U32* cumul = (U32*)workSpace; U16* cumul = (U16*)workSpace; /* size = maxSV1 */
FSE_FUNCTION_TYPE* tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSymbolValue + 2)); FSE_FUNCTION_TYPE* tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSV1+1)); /* size = tableSize */
BYTE* spread = tableSymbol + tableSize; /* size = tableSize */
U32 highThreshold = tableSize-1; U32 highThreshold = tableSize-1;
@ -98,20 +100,59 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
/* symbol start positions */ /* symbol start positions */
{ U32 u; { U32 u;
cumul[0] = 0; cumul[0] = 0;
for (u=1; u <= maxSymbolValue+1; u++) { for (u=1; u <= maxSV1; u++) {
if (normalizedCounter[u-1]==-1) { /* Low proba symbol */ if (normalizedCounter[u-1]==-1) { /* Low proba symbol */
cumul[u] = cumul[u-1] + 1; cumul[u] = cumul[u-1] + 1;
tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1); tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1);
} else { } else {
cumul[u] = cumul[u-1] + normalizedCounter[u-1]; cumul[u] = cumul[u-1] + normalizedCounter[u-1];
} } } }
cumul[maxSymbolValue+1] = tableSize+1; cumul[maxSV1] = (U16)(tableSize+1);
} }
/* Spread symbols */ /* Spread symbols */
{ U32 position = 0; if (highThreshold == tableSize - 1) {
/* Case for no low prob count symbols. Lay down 8 bytes at a time
* to reduce branch misses since we are operating on a small block
*/
{
U64 const add = 0x0101010101010101ull;
size_t pos = 0;
U64 sv = 0;
U32 s;
for (s=0; s<maxSV1; ++s, sv += add) {
int i;
int const n = normalizedCounter[s];
MEM_write64(spread + pos, sv);
for (i = 8; i < n; i += 8) {
MEM_write64(spread + pos + i, sv);
}
pos += n;
}
}
/* Spread symbols across the table. Lack of lowprob symbols means that
* we don't need variable sized inner loop, so we can unroll the loop and
* reduce branch misses.
*/
{
size_t position = 0;
size_t s;
size_t const unroll = 2; /* Experimentally determined optimal unroll */
assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
for (s = 0; s < (size_t)tableSize; s += unroll) {
size_t u;
for (u = 0; u < unroll; ++u) {
size_t const uPosition = (position + (u * step)) & tableMask;
tableSymbol[uPosition] = spread[s + u];
}
position = (position + (unroll * step)) & tableMask;
}
assert(position == 0);
}
} else {
U32 position = 0;
U32 symbol; U32 symbol;
for (symbol=0; symbol<=maxSymbolValue; symbol++) { for (symbol=0; symbol<maxSV1; symbol++) {
int nbOccurrences; int nbOccurrences;
int const freq = normalizedCounter[symbol]; int const freq = normalizedCounter[symbol];
for (nbOccurrences=0; nbOccurrences<freq; nbOccurrences++) { for (nbOccurrences=0; nbOccurrences<freq; nbOccurrences++) {